diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,91033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.9459705569889207, + "eval_steps": 500, + "global_step": 13000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00030353619669145547, + "grad_norm": 9.667811393737793, + "learning_rate": 1e-05, + "loss": 5.0202, + "step": 1 + }, + { + "epoch": 0.0006070723933829109, + "grad_norm": 10.303421974182129, + "learning_rate": 2e-05, + "loss": 4.7469, + "step": 2 + }, + { + "epoch": 0.0009106085900743664, + "grad_norm": 7.488056182861328, + "learning_rate": 3e-05, + "loss": 5.0105, + "step": 3 + }, + { + "epoch": 0.0012141447867658219, + "grad_norm": 4.885837078094482, + "learning_rate": 4e-05, + "loss": 4.3945, + "step": 4 + }, + { + "epoch": 0.0015176809834572772, + "grad_norm": 3.793656587600708, + "learning_rate": 5e-05, + "loss": 4.0574, + "step": 5 + }, + { + "epoch": 0.0018212171801487327, + "grad_norm": 3.9249916076660156, + "learning_rate": 6e-05, + "loss": 3.8179, + "step": 6 + }, + { + "epoch": 0.002124753376840188, + "grad_norm": 3.4937145709991455, + "learning_rate": 7e-05, + "loss": 3.5297, + "step": 7 + }, + { + "epoch": 0.0024282895735316438, + "grad_norm": 2.499041795730591, + "learning_rate": 8e-05, + "loss": 3.15, + "step": 8 + }, + { + "epoch": 0.002731825770223099, + "grad_norm": 2.0781290531158447, + "learning_rate": 9e-05, + "loss": 2.8658, + "step": 9 + }, + { + "epoch": 0.0030353619669145544, + "grad_norm": 2.0124764442443848, + "learning_rate": 0.0001, + "loss": 2.6826, + "step": 10 + }, + { + "epoch": 0.00333889816360601, + "grad_norm": 1.4209256172180176, + "learning_rate": 9.99949377341298e-05, + "loss": 2.5608, + "step": 11 + }, + { + "epoch": 0.0036424343602974654, + "grad_norm": 3.176084041595459, + "learning_rate": 9.99898754682596e-05, + "loss": 2.2416, + "step": 12 + }, + { + "epoch": 0.003945970556988921, + "grad_norm": 1.4457614421844482, + "learning_rate": 9.998481320238939e-05, + "loss": 2.1925, + "step": 13 + }, + { + "epoch": 0.004249506753680376, + "grad_norm": 1.3989348411560059, + "learning_rate": 9.997975093651918e-05, + "loss": 2.2165, + "step": 14 + }, + { + "epoch": 0.004553042950371832, + "grad_norm": 1.0647027492523193, + "learning_rate": 9.997468867064899e-05, + "loss": 2.3486, + "step": 15 + }, + { + "epoch": 0.0048565791470632875, + "grad_norm": 1.0246940851211548, + "learning_rate": 9.996962640477879e-05, + "loss": 2.19, + "step": 16 + }, + { + "epoch": 0.005160115343754742, + "grad_norm": 1.029646873474121, + "learning_rate": 9.996456413890858e-05, + "loss": 2.4052, + "step": 17 + }, + { + "epoch": 0.005463651540446198, + "grad_norm": 1.322654128074646, + "learning_rate": 9.995950187303838e-05, + "loss": 2.1927, + "step": 18 + }, + { + "epoch": 0.005767187737137654, + "grad_norm": 2.061326026916504, + "learning_rate": 9.995443960716817e-05, + "loss": 2.4574, + "step": 19 + }, + { + "epoch": 0.006070723933829109, + "grad_norm": 1.1343607902526855, + "learning_rate": 9.994937734129797e-05, + "loss": 1.9598, + "step": 20 + }, + { + "epoch": 0.0063742601305205645, + "grad_norm": 1.13712477684021, + "learning_rate": 9.994431507542776e-05, + "loss": 2.8643, + "step": 21 + }, + { + "epoch": 0.00667779632721202, + "grad_norm": 0.8220421671867371, + "learning_rate": 9.993925280955756e-05, + "loss": 2.0474, + "step": 22 + }, + { + "epoch": 0.006981332523903475, + "grad_norm": 0.8233473300933838, + "learning_rate": 9.993419054368735e-05, + "loss": 2.3597, + "step": 23 + }, + { + "epoch": 0.007284868720594931, + "grad_norm": 0.8661925196647644, + "learning_rate": 9.992912827781716e-05, + "loss": 2.2163, + "step": 24 + }, + { + "epoch": 0.007588404917286387, + "grad_norm": 0.7995729446411133, + "learning_rate": 9.992406601194695e-05, + "loss": 1.8051, + "step": 25 + }, + { + "epoch": 0.007891941113977842, + "grad_norm": 0.810165286064148, + "learning_rate": 9.991900374607675e-05, + "loss": 1.9189, + "step": 26 + }, + { + "epoch": 0.008195477310669297, + "grad_norm": 0.8240752220153809, + "learning_rate": 9.991394148020654e-05, + "loss": 1.7, + "step": 27 + }, + { + "epoch": 0.008499013507360752, + "grad_norm": 1.0160635709762573, + "learning_rate": 9.990887921433634e-05, + "loss": 2.2964, + "step": 28 + }, + { + "epoch": 0.008802549704052209, + "grad_norm": 0.794966995716095, + "learning_rate": 9.990381694846613e-05, + "loss": 1.7333, + "step": 29 + }, + { + "epoch": 0.009106085900743664, + "grad_norm": 0.5594797134399414, + "learning_rate": 9.989875468259593e-05, + "loss": 2.0925, + "step": 30 + }, + { + "epoch": 0.009409622097435118, + "grad_norm": 0.8100740909576416, + "learning_rate": 9.989369241672572e-05, + "loss": 2.1218, + "step": 31 + }, + { + "epoch": 0.009713158294126575, + "grad_norm": 0.7057996392250061, + "learning_rate": 9.988863015085552e-05, + "loss": 2.005, + "step": 32 + }, + { + "epoch": 0.01001669449081803, + "grad_norm": 0.8970999121665955, + "learning_rate": 9.988356788498533e-05, + "loss": 2.2414, + "step": 33 + }, + { + "epoch": 0.010320230687509485, + "grad_norm": 0.6290627717971802, + "learning_rate": 9.987850561911512e-05, + "loss": 2.2422, + "step": 34 + }, + { + "epoch": 0.010623766884200941, + "grad_norm": 0.5665722489356995, + "learning_rate": 9.987344335324492e-05, + "loss": 1.9342, + "step": 35 + }, + { + "epoch": 0.010927303080892396, + "grad_norm": 0.5792561173439026, + "learning_rate": 9.986838108737472e-05, + "loss": 1.8733, + "step": 36 + }, + { + "epoch": 0.011230839277583851, + "grad_norm": 0.5264159440994263, + "learning_rate": 9.986331882150452e-05, + "loss": 2.1739, + "step": 37 + }, + { + "epoch": 0.011534375474275308, + "grad_norm": 0.5069584250450134, + "learning_rate": 9.985825655563431e-05, + "loss": 1.6235, + "step": 38 + }, + { + "epoch": 0.011837911670966763, + "grad_norm": 0.7689110636711121, + "learning_rate": 9.985319428976411e-05, + "loss": 1.711, + "step": 39 + }, + { + "epoch": 0.012141447867658217, + "grad_norm": 0.7001574635505676, + "learning_rate": 9.98481320238939e-05, + "loss": 1.651, + "step": 40 + }, + { + "epoch": 0.012444984064349674, + "grad_norm": 0.5615801811218262, + "learning_rate": 9.98430697580237e-05, + "loss": 2.128, + "step": 41 + }, + { + "epoch": 0.012748520261041129, + "grad_norm": 0.8766308426856995, + "learning_rate": 9.983800749215349e-05, + "loss": 2.4421, + "step": 42 + }, + { + "epoch": 0.013052056457732584, + "grad_norm": 0.704547107219696, + "learning_rate": 9.983294522628329e-05, + "loss": 1.6921, + "step": 43 + }, + { + "epoch": 0.01335559265442404, + "grad_norm": 0.5749143362045288, + "learning_rate": 9.982788296041308e-05, + "loss": 2.0173, + "step": 44 + }, + { + "epoch": 0.013659128851115495, + "grad_norm": 0.7929263710975647, + "learning_rate": 9.982282069454289e-05, + "loss": 2.1755, + "step": 45 + }, + { + "epoch": 0.01396266504780695, + "grad_norm": 1.6391934156417847, + "learning_rate": 9.981775842867269e-05, + "loss": 2.4995, + "step": 46 + }, + { + "epoch": 0.014266201244498407, + "grad_norm": 0.49616461992263794, + "learning_rate": 9.981269616280248e-05, + "loss": 2.3363, + "step": 47 + }, + { + "epoch": 0.014569737441189862, + "grad_norm": 0.614272952079773, + "learning_rate": 9.980763389693227e-05, + "loss": 2.0277, + "step": 48 + }, + { + "epoch": 0.014873273637881317, + "grad_norm": 0.6181132197380066, + "learning_rate": 9.980257163106207e-05, + "loss": 2.2867, + "step": 49 + }, + { + "epoch": 0.015176809834572773, + "grad_norm": 0.5342630743980408, + "learning_rate": 9.979750936519186e-05, + "loss": 1.7314, + "step": 50 + }, + { + "epoch": 0.015480346031264228, + "grad_norm": 0.4582519233226776, + "learning_rate": 9.979244709932166e-05, + "loss": 1.9893, + "step": 51 + }, + { + "epoch": 0.015783882227955685, + "grad_norm": 0.5448606014251709, + "learning_rate": 9.978738483345145e-05, + "loss": 2.3266, + "step": 52 + }, + { + "epoch": 0.01608741842464714, + "grad_norm": 1.0823545455932617, + "learning_rate": 9.978232256758125e-05, + "loss": 2.1919, + "step": 53 + }, + { + "epoch": 0.016390954621338594, + "grad_norm": 0.5506464838981628, + "learning_rate": 9.977726030171106e-05, + "loss": 2.0735, + "step": 54 + }, + { + "epoch": 0.01669449081803005, + "grad_norm": 0.568626344203949, + "learning_rate": 9.977219803584085e-05, + "loss": 2.051, + "step": 55 + }, + { + "epoch": 0.016998027014721504, + "grad_norm": 0.512907087802887, + "learning_rate": 9.976713576997065e-05, + "loss": 1.6473, + "step": 56 + }, + { + "epoch": 0.017301563211412962, + "grad_norm": 0.5541898012161255, + "learning_rate": 9.976207350410044e-05, + "loss": 1.8184, + "step": 57 + }, + { + "epoch": 0.017605099408104417, + "grad_norm": 0.5083638429641724, + "learning_rate": 9.975701123823024e-05, + "loss": 1.7573, + "step": 58 + }, + { + "epoch": 0.017908635604795872, + "grad_norm": 0.4722895920276642, + "learning_rate": 9.975194897236003e-05, + "loss": 2.0311, + "step": 59 + }, + { + "epoch": 0.018212171801487327, + "grad_norm": 0.5068002343177795, + "learning_rate": 9.974688670648983e-05, + "loss": 2.1245, + "step": 60 + }, + { + "epoch": 0.018515707998178782, + "grad_norm": 0.5726852416992188, + "learning_rate": 9.974182444061962e-05, + "loss": 2.1017, + "step": 61 + }, + { + "epoch": 0.018819244194870237, + "grad_norm": 0.5240160226821899, + "learning_rate": 9.973676217474942e-05, + "loss": 2.2665, + "step": 62 + }, + { + "epoch": 0.019122780391561695, + "grad_norm": 0.4728144705295563, + "learning_rate": 9.973169990887921e-05, + "loss": 2.0537, + "step": 63 + }, + { + "epoch": 0.01942631658825315, + "grad_norm": 0.47115418314933777, + "learning_rate": 9.972663764300902e-05, + "loss": 1.2815, + "step": 64 + }, + { + "epoch": 0.019729852784944605, + "grad_norm": 0.7070208191871643, + "learning_rate": 9.972157537713881e-05, + "loss": 1.8514, + "step": 65 + }, + { + "epoch": 0.02003338898163606, + "grad_norm": 0.529069185256958, + "learning_rate": 9.971651311126861e-05, + "loss": 1.7602, + "step": 66 + }, + { + "epoch": 0.020336925178327515, + "grad_norm": 0.7532087564468384, + "learning_rate": 9.97114508453984e-05, + "loss": 2.2168, + "step": 67 + }, + { + "epoch": 0.02064046137501897, + "grad_norm": 0.5654622912406921, + "learning_rate": 9.97063885795282e-05, + "loss": 1.9634, + "step": 68 + }, + { + "epoch": 0.020943997571710428, + "grad_norm": 0.701452910900116, + "learning_rate": 9.970132631365799e-05, + "loss": 2.044, + "step": 69 + }, + { + "epoch": 0.021247533768401883, + "grad_norm": 0.5750812888145447, + "learning_rate": 9.969626404778779e-05, + "loss": 1.8015, + "step": 70 + }, + { + "epoch": 0.021551069965093338, + "grad_norm": 0.49930402636528015, + "learning_rate": 9.969120178191758e-05, + "loss": 1.7998, + "step": 71 + }, + { + "epoch": 0.021854606161784793, + "grad_norm": 0.4348014295101166, + "learning_rate": 9.968613951604738e-05, + "loss": 1.9959, + "step": 72 + }, + { + "epoch": 0.022158142358476247, + "grad_norm": 0.5268503427505493, + "learning_rate": 9.968107725017719e-05, + "loss": 1.8497, + "step": 73 + }, + { + "epoch": 0.022461678555167702, + "grad_norm": 0.578822135925293, + "learning_rate": 9.967601498430698e-05, + "loss": 2.3277, + "step": 74 + }, + { + "epoch": 0.02276521475185916, + "grad_norm": 0.52215975522995, + "learning_rate": 9.967095271843677e-05, + "loss": 2.1179, + "step": 75 + }, + { + "epoch": 0.023068750948550616, + "grad_norm": 0.4557477533817291, + "learning_rate": 9.966589045256657e-05, + "loss": 2.0132, + "step": 76 + }, + { + "epoch": 0.02337228714524207, + "grad_norm": 0.5032123327255249, + "learning_rate": 9.966082818669638e-05, + "loss": 1.8608, + "step": 77 + }, + { + "epoch": 0.023675823341933525, + "grad_norm": 0.42689865827560425, + "learning_rate": 9.965576592082617e-05, + "loss": 2.0437, + "step": 78 + }, + { + "epoch": 0.02397935953862498, + "grad_norm": 0.44310206174850464, + "learning_rate": 9.965070365495597e-05, + "loss": 2.1222, + "step": 79 + }, + { + "epoch": 0.024282895735316435, + "grad_norm": 0.4377008378505707, + "learning_rate": 9.964564138908576e-05, + "loss": 2.0418, + "step": 80 + }, + { + "epoch": 0.024586431932007893, + "grad_norm": 0.35174912214279175, + "learning_rate": 9.964057912321556e-05, + "loss": 1.6931, + "step": 81 + }, + { + "epoch": 0.024889968128699348, + "grad_norm": 0.47877687215805054, + "learning_rate": 9.963551685734535e-05, + "loss": 1.7049, + "step": 82 + }, + { + "epoch": 0.025193504325390803, + "grad_norm": 0.4063829183578491, + "learning_rate": 9.963045459147515e-05, + "loss": 1.8611, + "step": 83 + }, + { + "epoch": 0.025497040522082258, + "grad_norm": 0.4149170219898224, + "learning_rate": 9.962539232560496e-05, + "loss": 1.9439, + "step": 84 + }, + { + "epoch": 0.025800576718773713, + "grad_norm": 0.4882602393627167, + "learning_rate": 9.962033005973475e-05, + "loss": 1.5723, + "step": 85 + }, + { + "epoch": 0.026104112915465168, + "grad_norm": 0.4600992202758789, + "learning_rate": 9.961526779386454e-05, + "loss": 2.0142, + "step": 86 + }, + { + "epoch": 0.026407649112156626, + "grad_norm": 0.43366697430610657, + "learning_rate": 9.961020552799434e-05, + "loss": 1.9175, + "step": 87 + }, + { + "epoch": 0.02671118530884808, + "grad_norm": 0.501487135887146, + "learning_rate": 9.960514326212413e-05, + "loss": 1.5043, + "step": 88 + }, + { + "epoch": 0.027014721505539536, + "grad_norm": 0.43821993470191956, + "learning_rate": 9.960008099625393e-05, + "loss": 1.8622, + "step": 89 + }, + { + "epoch": 0.02731825770223099, + "grad_norm": 0.4433805048465729, + "learning_rate": 9.959501873038372e-05, + "loss": 1.9459, + "step": 90 + }, + { + "epoch": 0.027621793898922446, + "grad_norm": 0.4686216115951538, + "learning_rate": 9.958995646451352e-05, + "loss": 1.7405, + "step": 91 + }, + { + "epoch": 0.0279253300956139, + "grad_norm": 0.48586198687553406, + "learning_rate": 9.958489419864331e-05, + "loss": 2.2233, + "step": 92 + }, + { + "epoch": 0.02822886629230536, + "grad_norm": 0.4018734097480774, + "learning_rate": 9.957983193277312e-05, + "loss": 2.0027, + "step": 93 + }, + { + "epoch": 0.028532402488996814, + "grad_norm": 0.4996435344219208, + "learning_rate": 9.957476966690292e-05, + "loss": 1.5949, + "step": 94 + }, + { + "epoch": 0.02883593868568827, + "grad_norm": 0.45447826385498047, + "learning_rate": 9.956970740103271e-05, + "loss": 1.7636, + "step": 95 + }, + { + "epoch": 0.029139474882379723, + "grad_norm": 0.4209904372692108, + "learning_rate": 9.95646451351625e-05, + "loss": 1.7523, + "step": 96 + }, + { + "epoch": 0.029443011079071178, + "grad_norm": 0.3740164637565613, + "learning_rate": 9.95595828692923e-05, + "loss": 1.9136, + "step": 97 + }, + { + "epoch": 0.029746547275762633, + "grad_norm": 0.4169963598251343, + "learning_rate": 9.95545206034221e-05, + "loss": 1.9136, + "step": 98 + }, + { + "epoch": 0.03005008347245409, + "grad_norm": 0.4683006703853607, + "learning_rate": 9.954945833755189e-05, + "loss": 2.0657, + "step": 99 + }, + { + "epoch": 0.030353619669145546, + "grad_norm": 0.4508633017539978, + "learning_rate": 9.954439607168169e-05, + "loss": 2.1099, + "step": 100 + }, + { + "epoch": 0.030657155865837, + "grad_norm": 0.4136218726634979, + "learning_rate": 9.953933380581148e-05, + "loss": 2.0183, + "step": 101 + }, + { + "epoch": 0.030960692062528456, + "grad_norm": 0.44510790705680847, + "learning_rate": 9.953427153994127e-05, + "loss": 1.9307, + "step": 102 + }, + { + "epoch": 0.031264228259219914, + "grad_norm": 0.3713892698287964, + "learning_rate": 9.952920927407108e-05, + "loss": 1.7017, + "step": 103 + }, + { + "epoch": 0.03156776445591137, + "grad_norm": 0.47902294993400574, + "learning_rate": 9.952414700820088e-05, + "loss": 2.1172, + "step": 104 + }, + { + "epoch": 0.031871300652602824, + "grad_norm": 0.4492317736148834, + "learning_rate": 9.951908474233067e-05, + "loss": 1.9752, + "step": 105 + }, + { + "epoch": 0.03217483684929428, + "grad_norm": 0.4096255302429199, + "learning_rate": 9.951402247646047e-05, + "loss": 1.5511, + "step": 106 + }, + { + "epoch": 0.032478373045985734, + "grad_norm": 0.39630818367004395, + "learning_rate": 9.950896021059026e-05, + "loss": 2.11, + "step": 107 + }, + { + "epoch": 0.03278190924267719, + "grad_norm": 0.42648032307624817, + "learning_rate": 9.950389794472006e-05, + "loss": 2.1784, + "step": 108 + }, + { + "epoch": 0.033085445439368644, + "grad_norm": 0.4814178943634033, + "learning_rate": 9.949883567884985e-05, + "loss": 1.955, + "step": 109 + }, + { + "epoch": 0.0333889816360601, + "grad_norm": 0.41600191593170166, + "learning_rate": 9.949377341297965e-05, + "loss": 1.9163, + "step": 110 + }, + { + "epoch": 0.03369251783275155, + "grad_norm": 0.4610773026943207, + "learning_rate": 9.948871114710944e-05, + "loss": 1.7934, + "step": 111 + }, + { + "epoch": 0.03399605402944301, + "grad_norm": 0.43061718344688416, + "learning_rate": 9.948364888123925e-05, + "loss": 1.9278, + "step": 112 + }, + { + "epoch": 0.03429959022613446, + "grad_norm": 0.3907497227191925, + "learning_rate": 9.947858661536904e-05, + "loss": 1.996, + "step": 113 + }, + { + "epoch": 0.034603126422825925, + "grad_norm": 0.3984166383743286, + "learning_rate": 9.947352434949884e-05, + "loss": 1.5936, + "step": 114 + }, + { + "epoch": 0.03490666261951738, + "grad_norm": 0.43406423926353455, + "learning_rate": 9.946846208362863e-05, + "loss": 1.8866, + "step": 115 + }, + { + "epoch": 0.035210198816208835, + "grad_norm": 0.45913639664649963, + "learning_rate": 9.946339981775843e-05, + "loss": 1.972, + "step": 116 + }, + { + "epoch": 0.03551373501290029, + "grad_norm": 0.42077311873435974, + "learning_rate": 9.945833755188822e-05, + "loss": 2.0081, + "step": 117 + }, + { + "epoch": 0.035817271209591744, + "grad_norm": 0.41479435563087463, + "learning_rate": 9.945327528601802e-05, + "loss": 2.0096, + "step": 118 + }, + { + "epoch": 0.0361208074062832, + "grad_norm": 0.35669025778770447, + "learning_rate": 9.944821302014781e-05, + "loss": 2.0074, + "step": 119 + }, + { + "epoch": 0.036424343602974654, + "grad_norm": 0.4088069796562195, + "learning_rate": 9.944315075427761e-05, + "loss": 1.817, + "step": 120 + }, + { + "epoch": 0.03672787979966611, + "grad_norm": 0.49982163310050964, + "learning_rate": 9.943808848840742e-05, + "loss": 1.9218, + "step": 121 + }, + { + "epoch": 0.037031415996357564, + "grad_norm": 0.39924055337905884, + "learning_rate": 9.943302622253721e-05, + "loss": 2.2463, + "step": 122 + }, + { + "epoch": 0.03733495219304902, + "grad_norm": 0.40462177991867065, + "learning_rate": 9.942796395666702e-05, + "loss": 2.0844, + "step": 123 + }, + { + "epoch": 0.037638488389740474, + "grad_norm": 0.43440741300582886, + "learning_rate": 9.942290169079681e-05, + "loss": 1.8808, + "step": 124 + }, + { + "epoch": 0.03794202458643193, + "grad_norm": 0.4029730260372162, + "learning_rate": 9.941783942492661e-05, + "loss": 1.9427, + "step": 125 + }, + { + "epoch": 0.03824556078312339, + "grad_norm": 0.7807103395462036, + "learning_rate": 9.94127771590564e-05, + "loss": 1.9072, + "step": 126 + }, + { + "epoch": 0.038549096979814845, + "grad_norm": 0.5021561980247498, + "learning_rate": 9.94077148931862e-05, + "loss": 2.0582, + "step": 127 + }, + { + "epoch": 0.0388526331765063, + "grad_norm": 0.5161197781562805, + "learning_rate": 9.9402652627316e-05, + "loss": 1.9861, + "step": 128 + }, + { + "epoch": 0.039156169373197755, + "grad_norm": 0.5553935766220093, + "learning_rate": 9.939759036144579e-05, + "loss": 2.1893, + "step": 129 + }, + { + "epoch": 0.03945970556988921, + "grad_norm": 0.4241655170917511, + "learning_rate": 9.939252809557558e-05, + "loss": 1.9722, + "step": 130 + }, + { + "epoch": 0.039763241766580665, + "grad_norm": 0.43290001153945923, + "learning_rate": 9.938746582970538e-05, + "loss": 1.5364, + "step": 131 + }, + { + "epoch": 0.04006677796327212, + "grad_norm": 0.40089091658592224, + "learning_rate": 9.938240356383519e-05, + "loss": 1.9686, + "step": 132 + }, + { + "epoch": 0.040370314159963575, + "grad_norm": 0.4152032434940338, + "learning_rate": 9.937734129796498e-05, + "loss": 1.913, + "step": 133 + }, + { + "epoch": 0.04067385035665503, + "grad_norm": 0.4443211555480957, + "learning_rate": 9.937227903209478e-05, + "loss": 2.2354, + "step": 134 + }, + { + "epoch": 0.040977386553346484, + "grad_norm": 0.41355323791503906, + "learning_rate": 9.936721676622457e-05, + "loss": 2.1055, + "step": 135 + }, + { + "epoch": 0.04128092275003794, + "grad_norm": 0.5837479829788208, + "learning_rate": 9.936215450035437e-05, + "loss": 1.9085, + "step": 136 + }, + { + "epoch": 0.041584458946729394, + "grad_norm": 0.40269389748573303, + "learning_rate": 9.935709223448416e-05, + "loss": 2.0368, + "step": 137 + }, + { + "epoch": 0.041887995143420856, + "grad_norm": 0.5898969769477844, + "learning_rate": 9.935202996861396e-05, + "loss": 1.7933, + "step": 138 + }, + { + "epoch": 0.04219153134011231, + "grad_norm": 0.41117680072784424, + "learning_rate": 9.934696770274375e-05, + "loss": 1.7452, + "step": 139 + }, + { + "epoch": 0.042495067536803766, + "grad_norm": 0.5090368390083313, + "learning_rate": 9.934190543687354e-05, + "loss": 2.0141, + "step": 140 + }, + { + "epoch": 0.04279860373349522, + "grad_norm": 0.4821307957172394, + "learning_rate": 9.933684317100334e-05, + "loss": 1.9443, + "step": 141 + }, + { + "epoch": 0.043102139930186675, + "grad_norm": 0.41939428448677063, + "learning_rate": 9.933178090513315e-05, + "loss": 1.7401, + "step": 142 + }, + { + "epoch": 0.04340567612687813, + "grad_norm": 0.4531096816062927, + "learning_rate": 9.932671863926294e-05, + "loss": 1.9944, + "step": 143 + }, + { + "epoch": 0.043709212323569585, + "grad_norm": 0.44440799951553345, + "learning_rate": 9.932165637339274e-05, + "loss": 1.9648, + "step": 144 + }, + { + "epoch": 0.04401274852026104, + "grad_norm": 0.36847150325775146, + "learning_rate": 9.931659410752253e-05, + "loss": 2.0638, + "step": 145 + }, + { + "epoch": 0.044316284716952495, + "grad_norm": 0.6394171118736267, + "learning_rate": 9.931153184165233e-05, + "loss": 1.9476, + "step": 146 + }, + { + "epoch": 0.04461982091364395, + "grad_norm": 0.41597506403923035, + "learning_rate": 9.930646957578212e-05, + "loss": 1.535, + "step": 147 + }, + { + "epoch": 0.044923357110335405, + "grad_norm": 0.5597077012062073, + "learning_rate": 9.930140730991192e-05, + "loss": 1.6826, + "step": 148 + }, + { + "epoch": 0.045226893307026866, + "grad_norm": 0.5532084703445435, + "learning_rate": 9.929634504404171e-05, + "loss": 1.8063, + "step": 149 + }, + { + "epoch": 0.04553042950371832, + "grad_norm": 0.467339426279068, + "learning_rate": 9.92912827781715e-05, + "loss": 2.017, + "step": 150 + }, + { + "epoch": 0.045833965700409776, + "grad_norm": 0.4054040312767029, + "learning_rate": 9.928622051230131e-05, + "loss": 1.7582, + "step": 151 + }, + { + "epoch": 0.04613750189710123, + "grad_norm": 1.2743823528289795, + "learning_rate": 9.928115824643111e-05, + "loss": 2.0202, + "step": 152 + }, + { + "epoch": 0.046441038093792686, + "grad_norm": 0.4357397258281708, + "learning_rate": 9.92760959805609e-05, + "loss": 1.8788, + "step": 153 + }, + { + "epoch": 0.04674457429048414, + "grad_norm": 2.8793208599090576, + "learning_rate": 9.92710337146907e-05, + "loss": 2.1204, + "step": 154 + }, + { + "epoch": 0.047048110487175596, + "grad_norm": 0.9585952162742615, + "learning_rate": 9.92659714488205e-05, + "loss": 1.9356, + "step": 155 + }, + { + "epoch": 0.04735164668386705, + "grad_norm": 0.7857603430747986, + "learning_rate": 9.926090918295029e-05, + "loss": 1.9097, + "step": 156 + }, + { + "epoch": 0.047655182880558505, + "grad_norm": 0.5259221792221069, + "learning_rate": 9.925584691708008e-05, + "loss": 2.1589, + "step": 157 + }, + { + "epoch": 0.04795871907724996, + "grad_norm": 2.793253183364868, + "learning_rate": 9.925078465120988e-05, + "loss": 1.7202, + "step": 158 + }, + { + "epoch": 0.048262255273941415, + "grad_norm": 0.4432888627052307, + "learning_rate": 9.924572238533967e-05, + "loss": 1.9898, + "step": 159 + }, + { + "epoch": 0.04856579147063287, + "grad_norm": 0.4347291588783264, + "learning_rate": 9.924066011946948e-05, + "loss": 1.8142, + "step": 160 + }, + { + "epoch": 0.04886932766732433, + "grad_norm": 5.273514747619629, + "learning_rate": 9.923559785359928e-05, + "loss": 1.8665, + "step": 161 + }, + { + "epoch": 0.04917286386401579, + "grad_norm": 0.47988301515579224, + "learning_rate": 9.923053558772907e-05, + "loss": 1.9439, + "step": 162 + }, + { + "epoch": 0.04947640006070724, + "grad_norm": 0.3584117293357849, + "learning_rate": 9.922547332185887e-05, + "loss": 1.8109, + "step": 163 + }, + { + "epoch": 0.049779936257398696, + "grad_norm": 0.4074074923992157, + "learning_rate": 9.922041105598866e-05, + "loss": 2.1056, + "step": 164 + }, + { + "epoch": 0.05008347245409015, + "grad_norm": 3.159336566925049, + "learning_rate": 9.921534879011846e-05, + "loss": 1.8672, + "step": 165 + }, + { + "epoch": 0.050387008650781606, + "grad_norm": 0.38132309913635254, + "learning_rate": 9.921028652424826e-05, + "loss": 1.8423, + "step": 166 + }, + { + "epoch": 0.05069054484747306, + "grad_norm": 0.39241936802864075, + "learning_rate": 9.920522425837806e-05, + "loss": 1.5949, + "step": 167 + }, + { + "epoch": 0.050994081044164516, + "grad_norm": 0.38212037086486816, + "learning_rate": 9.920016199250785e-05, + "loss": 1.9669, + "step": 168 + }, + { + "epoch": 0.05129761724085597, + "grad_norm": 0.5353955030441284, + "learning_rate": 9.919509972663765e-05, + "loss": 2.1806, + "step": 169 + }, + { + "epoch": 0.051601153437547426, + "grad_norm": 0.4129483699798584, + "learning_rate": 9.919003746076744e-05, + "loss": 1.8858, + "step": 170 + }, + { + "epoch": 0.05190468963423888, + "grad_norm": 0.3832380771636963, + "learning_rate": 9.918497519489725e-05, + "loss": 2.0321, + "step": 171 + }, + { + "epoch": 0.052208225830930335, + "grad_norm": 0.4078863859176636, + "learning_rate": 9.917991292902705e-05, + "loss": 1.6213, + "step": 172 + }, + { + "epoch": 0.0525117620276218, + "grad_norm": 0.38865014910697937, + "learning_rate": 9.917485066315684e-05, + "loss": 2.0052, + "step": 173 + }, + { + "epoch": 0.05281529822431325, + "grad_norm": 0.4339440166950226, + "learning_rate": 9.916978839728664e-05, + "loss": 2.2405, + "step": 174 + }, + { + "epoch": 0.05311883442100471, + "grad_norm": 0.42063045501708984, + "learning_rate": 9.916472613141643e-05, + "loss": 1.6529, + "step": 175 + }, + { + "epoch": 0.05342237061769616, + "grad_norm": 0.4765849709510803, + "learning_rate": 9.915966386554623e-05, + "loss": 1.9645, + "step": 176 + }, + { + "epoch": 0.05372590681438762, + "grad_norm": 0.41431936621665955, + "learning_rate": 9.915460159967602e-05, + "loss": 1.9709, + "step": 177 + }, + { + "epoch": 0.05402944301107907, + "grad_norm": 0.3591434359550476, + "learning_rate": 9.914953933380581e-05, + "loss": 1.685, + "step": 178 + }, + { + "epoch": 0.054332979207770526, + "grad_norm": 0.45483240485191345, + "learning_rate": 9.914447706793561e-05, + "loss": 1.9362, + "step": 179 + }, + { + "epoch": 0.05463651540446198, + "grad_norm": 0.5468000173568726, + "learning_rate": 9.91394148020654e-05, + "loss": 1.6984, + "step": 180 + }, + { + "epoch": 0.054940051601153436, + "grad_norm": 0.4057190716266632, + "learning_rate": 9.913435253619521e-05, + "loss": 1.9887, + "step": 181 + }, + { + "epoch": 0.05524358779784489, + "grad_norm": 0.383211612701416, + "learning_rate": 9.912929027032501e-05, + "loss": 1.7825, + "step": 182 + }, + { + "epoch": 0.055547123994536346, + "grad_norm": 0.3480004668235779, + "learning_rate": 9.91242280044548e-05, + "loss": 1.8721, + "step": 183 + }, + { + "epoch": 0.0558506601912278, + "grad_norm": 0.47680413722991943, + "learning_rate": 9.91191657385846e-05, + "loss": 1.8113, + "step": 184 + }, + { + "epoch": 0.05615419638791926, + "grad_norm": 0.37727096676826477, + "learning_rate": 9.911410347271439e-05, + "loss": 1.7398, + "step": 185 + }, + { + "epoch": 0.05645773258461072, + "grad_norm": 0.47738176584243774, + "learning_rate": 9.910904120684419e-05, + "loss": 1.4651, + "step": 186 + }, + { + "epoch": 0.05676126878130217, + "grad_norm": 0.44533729553222656, + "learning_rate": 9.910397894097398e-05, + "loss": 1.5697, + "step": 187 + }, + { + "epoch": 0.05706480497799363, + "grad_norm": 0.45051974058151245, + "learning_rate": 9.909891667510378e-05, + "loss": 2.1577, + "step": 188 + }, + { + "epoch": 0.05736834117468508, + "grad_norm": 0.4709470272064209, + "learning_rate": 9.909385440923357e-05, + "loss": 2.0486, + "step": 189 + }, + { + "epoch": 0.05767187737137654, + "grad_norm": 0.4063846170902252, + "learning_rate": 9.908879214336338e-05, + "loss": 1.5453, + "step": 190 + }, + { + "epoch": 0.05797541356806799, + "grad_norm": 0.374362587928772, + "learning_rate": 9.908372987749317e-05, + "loss": 1.5611, + "step": 191 + }, + { + "epoch": 0.05827894976475945, + "grad_norm": 0.4852111041545868, + "learning_rate": 9.907866761162297e-05, + "loss": 1.6234, + "step": 192 + }, + { + "epoch": 0.0585824859614509, + "grad_norm": 0.6863122582435608, + "learning_rate": 9.907360534575276e-05, + "loss": 2.1612, + "step": 193 + }, + { + "epoch": 0.058886022158142357, + "grad_norm": 0.6040588021278381, + "learning_rate": 9.906854307988256e-05, + "loss": 2.1092, + "step": 194 + }, + { + "epoch": 0.05918955835483381, + "grad_norm": 0.4148467779159546, + "learning_rate": 9.906348081401235e-05, + "loss": 2.1108, + "step": 195 + }, + { + "epoch": 0.059493094551525266, + "grad_norm": 0.36098209023475647, + "learning_rate": 9.905841854814215e-05, + "loss": 2.0002, + "step": 196 + }, + { + "epoch": 0.05979663074821673, + "grad_norm": 0.42360183596611023, + "learning_rate": 9.905335628227194e-05, + "loss": 2.3124, + "step": 197 + }, + { + "epoch": 0.06010016694490818, + "grad_norm": 0.3650914430618286, + "learning_rate": 9.904829401640174e-05, + "loss": 1.8778, + "step": 198 + }, + { + "epoch": 0.06040370314159964, + "grad_norm": 0.392995148897171, + "learning_rate": 9.904323175053155e-05, + "loss": 2.16, + "step": 199 + }, + { + "epoch": 0.06070723933829109, + "grad_norm": 0.46390387415885925, + "learning_rate": 9.903816948466134e-05, + "loss": 1.8695, + "step": 200 + }, + { + "epoch": 0.06101077553498255, + "grad_norm": 0.3954870402812958, + "learning_rate": 9.903310721879114e-05, + "loss": 1.9233, + "step": 201 + }, + { + "epoch": 0.061314311731674, + "grad_norm": 0.3650193214416504, + "learning_rate": 9.902804495292093e-05, + "loss": 2.2504, + "step": 202 + }, + { + "epoch": 0.06161784792836546, + "grad_norm": 0.3582104742527008, + "learning_rate": 9.902298268705073e-05, + "loss": 1.9303, + "step": 203 + }, + { + "epoch": 0.06192138412505691, + "grad_norm": 0.35688868165016174, + "learning_rate": 9.901792042118052e-05, + "loss": 1.7078, + "step": 204 + }, + { + "epoch": 0.06222492032174837, + "grad_norm": 0.3666802942752838, + "learning_rate": 9.901285815531031e-05, + "loss": 1.941, + "step": 205 + }, + { + "epoch": 0.06252845651843983, + "grad_norm": 0.42375093698501587, + "learning_rate": 9.900779588944011e-05, + "loss": 2.0858, + "step": 206 + }, + { + "epoch": 0.06283199271513128, + "grad_norm": 0.3913770318031311, + "learning_rate": 9.90027336235699e-05, + "loss": 2.1423, + "step": 207 + }, + { + "epoch": 0.06313552891182274, + "grad_norm": 0.4101809859275818, + "learning_rate": 9.89976713576997e-05, + "loss": 2.0497, + "step": 208 + }, + { + "epoch": 0.06343906510851419, + "grad_norm": 0.3696439564228058, + "learning_rate": 9.899260909182951e-05, + "loss": 1.9692, + "step": 209 + }, + { + "epoch": 0.06374260130520565, + "grad_norm": 0.3725574016571045, + "learning_rate": 9.89875468259593e-05, + "loss": 2.2053, + "step": 210 + }, + { + "epoch": 0.0640461375018971, + "grad_norm": 0.4886903166770935, + "learning_rate": 9.898248456008911e-05, + "loss": 1.8981, + "step": 211 + }, + { + "epoch": 0.06434967369858856, + "grad_norm": 0.4423249661922455, + "learning_rate": 9.89774222942189e-05, + "loss": 1.9058, + "step": 212 + }, + { + "epoch": 0.06465320989528, + "grad_norm": 0.4045765697956085, + "learning_rate": 9.89723600283487e-05, + "loss": 1.8056, + "step": 213 + }, + { + "epoch": 0.06495674609197147, + "grad_norm": 0.43866047263145447, + "learning_rate": 9.89672977624785e-05, + "loss": 1.6315, + "step": 214 + }, + { + "epoch": 0.06526028228866293, + "grad_norm": 0.524714469909668, + "learning_rate": 9.896223549660829e-05, + "loss": 2.0156, + "step": 215 + }, + { + "epoch": 0.06556381848535438, + "grad_norm": 0.3752996325492859, + "learning_rate": 9.895717323073808e-05, + "loss": 2.2768, + "step": 216 + }, + { + "epoch": 0.06586735468204584, + "grad_norm": 0.4371670186519623, + "learning_rate": 9.895211096486788e-05, + "loss": 2.0755, + "step": 217 + }, + { + "epoch": 0.06617089087873729, + "grad_norm": 0.3751063644886017, + "learning_rate": 9.894704869899767e-05, + "loss": 2.2451, + "step": 218 + }, + { + "epoch": 0.06647442707542875, + "grad_norm": 0.6649600267410278, + "learning_rate": 9.894198643312747e-05, + "loss": 1.9835, + "step": 219 + }, + { + "epoch": 0.0667779632721202, + "grad_norm": 0.3941735625267029, + "learning_rate": 9.893692416725728e-05, + "loss": 2.0203, + "step": 220 + }, + { + "epoch": 0.06708149946881166, + "grad_norm": 0.41888293623924255, + "learning_rate": 9.893186190138707e-05, + "loss": 1.7572, + "step": 221 + }, + { + "epoch": 0.0673850356655031, + "grad_norm": 0.4820149838924408, + "learning_rate": 9.892679963551687e-05, + "loss": 2.0591, + "step": 222 + }, + { + "epoch": 0.06768857186219457, + "grad_norm": 0.3516736626625061, + "learning_rate": 9.892173736964666e-05, + "loss": 1.9398, + "step": 223 + }, + { + "epoch": 0.06799210805888602, + "grad_norm": 0.3873218894004822, + "learning_rate": 9.891667510377646e-05, + "loss": 1.6389, + "step": 224 + }, + { + "epoch": 0.06829564425557748, + "grad_norm": 0.3793487846851349, + "learning_rate": 9.891161283790625e-05, + "loss": 2.0075, + "step": 225 + }, + { + "epoch": 0.06859918045226893, + "grad_norm": 0.38987675309181213, + "learning_rate": 9.890655057203605e-05, + "loss": 2.0903, + "step": 226 + }, + { + "epoch": 0.06890271664896039, + "grad_norm": 0.4293549358844757, + "learning_rate": 9.890148830616584e-05, + "loss": 2.2099, + "step": 227 + }, + { + "epoch": 0.06920625284565185, + "grad_norm": 0.39895692467689514, + "learning_rate": 9.889642604029564e-05, + "loss": 1.8615, + "step": 228 + }, + { + "epoch": 0.0695097890423433, + "grad_norm": 0.4543936252593994, + "learning_rate": 9.889136377442544e-05, + "loss": 2.0828, + "step": 229 + }, + { + "epoch": 0.06981332523903476, + "grad_norm": 0.448477566242218, + "learning_rate": 9.888630150855524e-05, + "loss": 1.5524, + "step": 230 + }, + { + "epoch": 0.07011686143572621, + "grad_norm": 0.428975373506546, + "learning_rate": 9.888123924268503e-05, + "loss": 1.3828, + "step": 231 + }, + { + "epoch": 0.07042039763241767, + "grad_norm": 0.42287349700927734, + "learning_rate": 9.887617697681483e-05, + "loss": 2.096, + "step": 232 + }, + { + "epoch": 0.07072393382910912, + "grad_norm": 0.43614649772644043, + "learning_rate": 9.887111471094462e-05, + "loss": 1.8238, + "step": 233 + }, + { + "epoch": 0.07102747002580058, + "grad_norm": 0.47309553623199463, + "learning_rate": 9.886605244507442e-05, + "loss": 2.3526, + "step": 234 + }, + { + "epoch": 0.07133100622249203, + "grad_norm": 0.9558483362197876, + "learning_rate": 9.886099017920421e-05, + "loss": 1.9816, + "step": 235 + }, + { + "epoch": 0.07163454241918349, + "grad_norm": 0.3529858887195587, + "learning_rate": 9.885592791333401e-05, + "loss": 2.0314, + "step": 236 + }, + { + "epoch": 0.07193807861587494, + "grad_norm": 0.37652599811553955, + "learning_rate": 9.88508656474638e-05, + "loss": 1.9381, + "step": 237 + }, + { + "epoch": 0.0722416148125664, + "grad_norm": 0.40783143043518066, + "learning_rate": 9.884580338159361e-05, + "loss": 1.966, + "step": 238 + }, + { + "epoch": 0.07254515100925786, + "grad_norm": 0.4160328805446625, + "learning_rate": 9.88407411157234e-05, + "loss": 1.8176, + "step": 239 + }, + { + "epoch": 0.07284868720594931, + "grad_norm": 0.4397304952144623, + "learning_rate": 9.88356788498532e-05, + "loss": 1.6766, + "step": 240 + }, + { + "epoch": 0.07315222340264077, + "grad_norm": 0.42549702525138855, + "learning_rate": 9.8830616583983e-05, + "loss": 2.1176, + "step": 241 + }, + { + "epoch": 0.07345575959933222, + "grad_norm": 0.3747939169406891, + "learning_rate": 9.882555431811279e-05, + "loss": 1.5494, + "step": 242 + }, + { + "epoch": 0.07375929579602368, + "grad_norm": 3.4551990032196045, + "learning_rate": 9.882049205224258e-05, + "loss": 2.0336, + "step": 243 + }, + { + "epoch": 0.07406283199271513, + "grad_norm": 1.5632964372634888, + "learning_rate": 9.881542978637238e-05, + "loss": 1.7452, + "step": 244 + }, + { + "epoch": 0.07436636818940659, + "grad_norm": 0.41575855016708374, + "learning_rate": 9.881036752050217e-05, + "loss": 2.0243, + "step": 245 + }, + { + "epoch": 0.07466990438609804, + "grad_norm": 0.44168713688850403, + "learning_rate": 9.880530525463197e-05, + "loss": 2.0022, + "step": 246 + }, + { + "epoch": 0.0749734405827895, + "grad_norm": 0.46640321612358093, + "learning_rate": 9.880024298876176e-05, + "loss": 1.555, + "step": 247 + }, + { + "epoch": 0.07527697677948095, + "grad_norm": 0.3622835576534271, + "learning_rate": 9.879518072289157e-05, + "loss": 1.876, + "step": 248 + }, + { + "epoch": 0.07558051297617241, + "grad_norm": 0.6277987957000732, + "learning_rate": 9.879011845702137e-05, + "loss": 2.2753, + "step": 249 + }, + { + "epoch": 0.07588404917286386, + "grad_norm": 0.40246644616127014, + "learning_rate": 9.878505619115116e-05, + "loss": 1.5991, + "step": 250 + }, + { + "epoch": 0.07618758536955532, + "grad_norm": 0.38388529419898987, + "learning_rate": 9.877999392528096e-05, + "loss": 1.9226, + "step": 251 + }, + { + "epoch": 0.07649112156624678, + "grad_norm": 0.39985090494155884, + "learning_rate": 9.877493165941075e-05, + "loss": 2.0722, + "step": 252 + }, + { + "epoch": 0.07679465776293823, + "grad_norm": 0.3872128427028656, + "learning_rate": 9.876986939354055e-05, + "loss": 1.9132, + "step": 253 + }, + { + "epoch": 0.07709819395962969, + "grad_norm": 0.3665171265602112, + "learning_rate": 9.876480712767034e-05, + "loss": 1.6244, + "step": 254 + }, + { + "epoch": 0.07740173015632114, + "grad_norm": 0.4011310040950775, + "learning_rate": 9.875974486180015e-05, + "loss": 2.1289, + "step": 255 + }, + { + "epoch": 0.0777052663530126, + "grad_norm": 0.35013166069984436, + "learning_rate": 9.875468259592994e-05, + "loss": 1.9738, + "step": 256 + }, + { + "epoch": 0.07800880254970405, + "grad_norm": 0.48468607664108276, + "learning_rate": 9.874962033005974e-05, + "loss": 2.1368, + "step": 257 + }, + { + "epoch": 0.07831233874639551, + "grad_norm": 0.5015551447868347, + "learning_rate": 9.874455806418953e-05, + "loss": 2.1218, + "step": 258 + }, + { + "epoch": 0.07861587494308696, + "grad_norm": 0.41915133595466614, + "learning_rate": 9.873949579831934e-05, + "loss": 2.0052, + "step": 259 + }, + { + "epoch": 0.07891941113977842, + "grad_norm": 0.4414760172367096, + "learning_rate": 9.873443353244914e-05, + "loss": 1.7249, + "step": 260 + }, + { + "epoch": 0.07922294733646987, + "grad_norm": 0.47259169816970825, + "learning_rate": 9.872937126657893e-05, + "loss": 2.1041, + "step": 261 + }, + { + "epoch": 0.07952648353316133, + "grad_norm": 0.3689124882221222, + "learning_rate": 9.872430900070873e-05, + "loss": 1.8956, + "step": 262 + }, + { + "epoch": 0.07983001972985279, + "grad_norm": 0.3948320150375366, + "learning_rate": 9.871924673483852e-05, + "loss": 1.9211, + "step": 263 + }, + { + "epoch": 0.08013355592654424, + "grad_norm": 0.4235248267650604, + "learning_rate": 9.871418446896832e-05, + "loss": 1.7115, + "step": 264 + }, + { + "epoch": 0.0804370921232357, + "grad_norm": 0.48399198055267334, + "learning_rate": 9.870912220309811e-05, + "loss": 1.77, + "step": 265 + }, + { + "epoch": 0.08074062831992715, + "grad_norm": 0.34047526121139526, + "learning_rate": 9.87040599372279e-05, + "loss": 1.7189, + "step": 266 + }, + { + "epoch": 0.08104416451661861, + "grad_norm": 0.47203269600868225, + "learning_rate": 9.86989976713577e-05, + "loss": 1.7674, + "step": 267 + }, + { + "epoch": 0.08134770071331006, + "grad_norm": 0.3752756118774414, + "learning_rate": 9.869393540548751e-05, + "loss": 1.8716, + "step": 268 + }, + { + "epoch": 0.08165123691000152, + "grad_norm": 0.3437153697013855, + "learning_rate": 9.86888731396173e-05, + "loss": 1.9824, + "step": 269 + }, + { + "epoch": 0.08195477310669297, + "grad_norm": 0.4854094088077545, + "learning_rate": 9.86838108737471e-05, + "loss": 1.4385, + "step": 270 + }, + { + "epoch": 0.08225830930338443, + "grad_norm": 0.37674829363822937, + "learning_rate": 9.86787486078769e-05, + "loss": 1.7877, + "step": 271 + }, + { + "epoch": 0.08256184550007588, + "grad_norm": 0.4215140640735626, + "learning_rate": 9.867368634200669e-05, + "loss": 2.1854, + "step": 272 + }, + { + "epoch": 0.08286538169676734, + "grad_norm": 0.3680359423160553, + "learning_rate": 9.866862407613648e-05, + "loss": 2.104, + "step": 273 + }, + { + "epoch": 0.08316891789345879, + "grad_norm": 0.4195649325847626, + "learning_rate": 9.866356181026628e-05, + "loss": 1.469, + "step": 274 + }, + { + "epoch": 0.08347245409015025, + "grad_norm": 0.480640709400177, + "learning_rate": 9.865849954439607e-05, + "loss": 1.8329, + "step": 275 + }, + { + "epoch": 0.08377599028684171, + "grad_norm": 0.34760695695877075, + "learning_rate": 9.865343727852587e-05, + "loss": 1.9495, + "step": 276 + }, + { + "epoch": 0.08407952648353316, + "grad_norm": 0.3803161680698395, + "learning_rate": 9.864837501265568e-05, + "loss": 1.9294, + "step": 277 + }, + { + "epoch": 0.08438306268022462, + "grad_norm": 0.41739675402641296, + "learning_rate": 9.864331274678547e-05, + "loss": 2.059, + "step": 278 + }, + { + "epoch": 0.08468659887691607, + "grad_norm": 0.3807448744773865, + "learning_rate": 9.863825048091527e-05, + "loss": 1.9741, + "step": 279 + }, + { + "epoch": 0.08499013507360753, + "grad_norm": 0.3610997200012207, + "learning_rate": 9.863318821504506e-05, + "loss": 1.9815, + "step": 280 + }, + { + "epoch": 0.08529367127029898, + "grad_norm": 0.3797460198402405, + "learning_rate": 9.862812594917485e-05, + "loss": 2.1394, + "step": 281 + }, + { + "epoch": 0.08559720746699044, + "grad_norm": 0.3922887444496155, + "learning_rate": 9.862306368330465e-05, + "loss": 2.184, + "step": 282 + }, + { + "epoch": 0.08590074366368189, + "grad_norm": 0.38251930475234985, + "learning_rate": 9.861800141743444e-05, + "loss": 2.0186, + "step": 283 + }, + { + "epoch": 0.08620427986037335, + "grad_norm": 0.35968562960624695, + "learning_rate": 9.861293915156424e-05, + "loss": 2.0, + "step": 284 + }, + { + "epoch": 0.0865078160570648, + "grad_norm": 0.37149590253829956, + "learning_rate": 9.860787688569403e-05, + "loss": 1.7941, + "step": 285 + }, + { + "epoch": 0.08681135225375626, + "grad_norm": 0.36890628933906555, + "learning_rate": 9.860281461982383e-05, + "loss": 1.906, + "step": 286 + }, + { + "epoch": 0.08711488845044772, + "grad_norm": 0.36025917530059814, + "learning_rate": 9.859775235395364e-05, + "loss": 1.9655, + "step": 287 + }, + { + "epoch": 0.08741842464713917, + "grad_norm": 0.3704364001750946, + "learning_rate": 9.859269008808343e-05, + "loss": 1.8657, + "step": 288 + }, + { + "epoch": 0.08772196084383063, + "grad_norm": 0.5996513962745667, + "learning_rate": 9.858762782221323e-05, + "loss": 1.7448, + "step": 289 + }, + { + "epoch": 0.08802549704052208, + "grad_norm": 0.3615630269050598, + "learning_rate": 9.858256555634302e-05, + "loss": 1.9007, + "step": 290 + }, + { + "epoch": 0.08832903323721354, + "grad_norm": 0.36014246940612793, + "learning_rate": 9.857750329047282e-05, + "loss": 1.927, + "step": 291 + }, + { + "epoch": 0.08863256943390499, + "grad_norm": 0.5038754940032959, + "learning_rate": 9.857244102460261e-05, + "loss": 1.6613, + "step": 292 + }, + { + "epoch": 0.08893610563059645, + "grad_norm": 0.3880213797092438, + "learning_rate": 9.85673787587324e-05, + "loss": 1.5563, + "step": 293 + }, + { + "epoch": 0.0892396418272879, + "grad_norm": 0.43225082755088806, + "learning_rate": 9.85623164928622e-05, + "loss": 1.5534, + "step": 294 + }, + { + "epoch": 0.08954317802397936, + "grad_norm": 0.44342055916786194, + "learning_rate": 9.8557254226992e-05, + "loss": 1.6211, + "step": 295 + }, + { + "epoch": 0.08984671422067081, + "grad_norm": 0.42114123702049255, + "learning_rate": 9.85521919611218e-05, + "loss": 1.9731, + "step": 296 + }, + { + "epoch": 0.09015025041736227, + "grad_norm": 0.43151113390922546, + "learning_rate": 9.85471296952516e-05, + "loss": 1.9519, + "step": 297 + }, + { + "epoch": 0.09045378661405373, + "grad_norm": 0.38092517852783203, + "learning_rate": 9.85420674293814e-05, + "loss": 2.0973, + "step": 298 + }, + { + "epoch": 0.09075732281074518, + "grad_norm": 0.40729570388793945, + "learning_rate": 9.853700516351119e-05, + "loss": 1.4395, + "step": 299 + }, + { + "epoch": 0.09106085900743664, + "grad_norm": 0.3631846308708191, + "learning_rate": 9.8531942897641e-05, + "loss": 1.2255, + "step": 300 + }, + { + "epoch": 0.09136439520412809, + "grad_norm": 0.37764397263526917, + "learning_rate": 9.852688063177079e-05, + "loss": 1.9941, + "step": 301 + }, + { + "epoch": 0.09166793140081955, + "grad_norm": 0.3755379319190979, + "learning_rate": 9.852181836590059e-05, + "loss": 1.7154, + "step": 302 + }, + { + "epoch": 0.091971467597511, + "grad_norm": 0.39003854990005493, + "learning_rate": 9.851675610003038e-05, + "loss": 1.928, + "step": 303 + }, + { + "epoch": 0.09227500379420246, + "grad_norm": 0.39592432975769043, + "learning_rate": 9.851169383416018e-05, + "loss": 2.1913, + "step": 304 + }, + { + "epoch": 0.09257853999089391, + "grad_norm": 0.4315894842147827, + "learning_rate": 9.850663156828997e-05, + "loss": 1.6432, + "step": 305 + }, + { + "epoch": 0.09288207618758537, + "grad_norm": 0.4103511571884155, + "learning_rate": 9.850156930241977e-05, + "loss": 1.9944, + "step": 306 + }, + { + "epoch": 0.09318561238427682, + "grad_norm": 0.4236547350883484, + "learning_rate": 9.849650703654957e-05, + "loss": 1.875, + "step": 307 + }, + { + "epoch": 0.09348914858096828, + "grad_norm": 0.41012468934059143, + "learning_rate": 9.849144477067937e-05, + "loss": 2.008, + "step": 308 + }, + { + "epoch": 0.09379268477765973, + "grad_norm": 0.35538622736930847, + "learning_rate": 9.848638250480916e-05, + "loss": 1.7322, + "step": 309 + }, + { + "epoch": 0.09409622097435119, + "grad_norm": 0.3874755799770355, + "learning_rate": 9.848132023893896e-05, + "loss": 1.9818, + "step": 310 + }, + { + "epoch": 0.09439975717104265, + "grad_norm": 0.42444977164268494, + "learning_rate": 9.847625797306875e-05, + "loss": 2.1606, + "step": 311 + }, + { + "epoch": 0.0947032933677341, + "grad_norm": 0.5855305194854736, + "learning_rate": 9.847119570719855e-05, + "loss": 1.4887, + "step": 312 + }, + { + "epoch": 0.09500682956442556, + "grad_norm": 0.35223227739334106, + "learning_rate": 9.846613344132834e-05, + "loss": 2.0025, + "step": 313 + }, + { + "epoch": 0.09531036576111701, + "grad_norm": 0.4013148844242096, + "learning_rate": 9.846107117545814e-05, + "loss": 1.9702, + "step": 314 + }, + { + "epoch": 0.09561390195780847, + "grad_norm": 0.5038349032402039, + "learning_rate": 9.845600890958793e-05, + "loss": 2.1532, + "step": 315 + }, + { + "epoch": 0.09591743815449992, + "grad_norm": 0.4826093018054962, + "learning_rate": 9.845094664371774e-05, + "loss": 2.0118, + "step": 316 + }, + { + "epoch": 0.09622097435119138, + "grad_norm": 0.41135913133621216, + "learning_rate": 9.844588437784754e-05, + "loss": 2.0707, + "step": 317 + }, + { + "epoch": 0.09652451054788283, + "grad_norm": 0.4353053569793701, + "learning_rate": 9.844082211197733e-05, + "loss": 2.104, + "step": 318 + }, + { + "epoch": 0.09682804674457429, + "grad_norm": 0.4192908704280853, + "learning_rate": 9.843575984610712e-05, + "loss": 1.9489, + "step": 319 + }, + { + "epoch": 0.09713158294126574, + "grad_norm": 0.380562424659729, + "learning_rate": 9.843069758023692e-05, + "loss": 1.3602, + "step": 320 + }, + { + "epoch": 0.0974351191379572, + "grad_norm": 0.3394995331764221, + "learning_rate": 9.842563531436671e-05, + "loss": 2.2161, + "step": 321 + }, + { + "epoch": 0.09773865533464866, + "grad_norm": 0.3419237434864044, + "learning_rate": 9.842057304849651e-05, + "loss": 1.7146, + "step": 322 + }, + { + "epoch": 0.09804219153134011, + "grad_norm": 0.3590264618396759, + "learning_rate": 9.84155107826263e-05, + "loss": 1.8654, + "step": 323 + }, + { + "epoch": 0.09834572772803157, + "grad_norm": 0.40006300806999207, + "learning_rate": 9.84104485167561e-05, + "loss": 1.5787, + "step": 324 + }, + { + "epoch": 0.09864926392472302, + "grad_norm": 0.33313074707984924, + "learning_rate": 9.84053862508859e-05, + "loss": 1.8653, + "step": 325 + }, + { + "epoch": 0.09895280012141448, + "grad_norm": 0.39681655168533325, + "learning_rate": 9.84003239850157e-05, + "loss": 2.178, + "step": 326 + }, + { + "epoch": 0.09925633631810593, + "grad_norm": 0.41945868730545044, + "learning_rate": 9.83952617191455e-05, + "loss": 1.8324, + "step": 327 + }, + { + "epoch": 0.09955987251479739, + "grad_norm": 0.3957304060459137, + "learning_rate": 9.839019945327529e-05, + "loss": 1.6468, + "step": 328 + }, + { + "epoch": 0.09986340871148884, + "grad_norm": 0.35814937949180603, + "learning_rate": 9.838513718740509e-05, + "loss": 1.6492, + "step": 329 + }, + { + "epoch": 0.1001669449081803, + "grad_norm": 0.38410916924476624, + "learning_rate": 9.838007492153488e-05, + "loss": 1.7223, + "step": 330 + }, + { + "epoch": 0.10047048110487175, + "grad_norm": 0.38490885496139526, + "learning_rate": 9.837501265566468e-05, + "loss": 2.0166, + "step": 331 + }, + { + "epoch": 0.10077401730156321, + "grad_norm": 0.38943415880203247, + "learning_rate": 9.836995038979447e-05, + "loss": 1.371, + "step": 332 + }, + { + "epoch": 0.10107755349825466, + "grad_norm": 0.39741018414497375, + "learning_rate": 9.836488812392427e-05, + "loss": 1.6233, + "step": 333 + }, + { + "epoch": 0.10138108969494612, + "grad_norm": 0.4663957357406616, + "learning_rate": 9.835982585805406e-05, + "loss": 1.746, + "step": 334 + }, + { + "epoch": 0.10168462589163758, + "grad_norm": 0.37118905782699585, + "learning_rate": 9.835476359218387e-05, + "loss": 1.9684, + "step": 335 + }, + { + "epoch": 0.10198816208832903, + "grad_norm": 0.40275588631629944, + "learning_rate": 9.834970132631366e-05, + "loss": 1.9551, + "step": 336 + }, + { + "epoch": 0.1022916982850205, + "grad_norm": 0.4336283206939697, + "learning_rate": 9.834463906044346e-05, + "loss": 2.0711, + "step": 337 + }, + { + "epoch": 0.10259523448171194, + "grad_norm": 0.35735735297203064, + "learning_rate": 9.833957679457325e-05, + "loss": 2.1397, + "step": 338 + }, + { + "epoch": 0.1028987706784034, + "grad_norm": 0.37825390696525574, + "learning_rate": 9.833451452870305e-05, + "loss": 1.7494, + "step": 339 + }, + { + "epoch": 0.10320230687509485, + "grad_norm": 0.3384961783885956, + "learning_rate": 9.832945226283284e-05, + "loss": 2.0197, + "step": 340 + }, + { + "epoch": 0.10350584307178631, + "grad_norm": 0.46276888251304626, + "learning_rate": 9.832438999696264e-05, + "loss": 1.797, + "step": 341 + }, + { + "epoch": 0.10380937926847776, + "grad_norm": 0.3685421347618103, + "learning_rate": 9.831932773109243e-05, + "loss": 1.9301, + "step": 342 + }, + { + "epoch": 0.10411291546516922, + "grad_norm": 0.38931936025619507, + "learning_rate": 9.831426546522223e-05, + "loss": 1.9623, + "step": 343 + }, + { + "epoch": 0.10441645166186067, + "grad_norm": 0.46678805351257324, + "learning_rate": 9.830920319935204e-05, + "loss": 1.6708, + "step": 344 + }, + { + "epoch": 0.10471998785855213, + "grad_norm": 0.4199204444885254, + "learning_rate": 9.830414093348183e-05, + "loss": 1.8014, + "step": 345 + }, + { + "epoch": 0.1050235240552436, + "grad_norm": 0.41024506092071533, + "learning_rate": 9.829907866761164e-05, + "loss": 1.8829, + "step": 346 + }, + { + "epoch": 0.10532706025193504, + "grad_norm": 0.5271286368370056, + "learning_rate": 9.829401640174143e-05, + "loss": 1.7796, + "step": 347 + }, + { + "epoch": 0.1056305964486265, + "grad_norm": 0.3593878448009491, + "learning_rate": 9.828895413587123e-05, + "loss": 2.0697, + "step": 348 + }, + { + "epoch": 0.10593413264531795, + "grad_norm": 0.44404372572898865, + "learning_rate": 9.828389187000102e-05, + "loss": 2.3235, + "step": 349 + }, + { + "epoch": 0.10623766884200941, + "grad_norm": 0.4072231650352478, + "learning_rate": 9.827882960413082e-05, + "loss": 1.5391, + "step": 350 + }, + { + "epoch": 0.10654120503870086, + "grad_norm": 0.3924303352832794, + "learning_rate": 9.827376733826061e-05, + "loss": 2.0649, + "step": 351 + }, + { + "epoch": 0.10684474123539232, + "grad_norm": 0.3815264105796814, + "learning_rate": 9.826870507239041e-05, + "loss": 1.5821, + "step": 352 + }, + { + "epoch": 0.10714827743208377, + "grad_norm": 0.40832409262657166, + "learning_rate": 9.82636428065202e-05, + "loss": 2.1135, + "step": 353 + }, + { + "epoch": 0.10745181362877523, + "grad_norm": 0.40270155668258667, + "learning_rate": 9.825858054065e-05, + "loss": 1.6561, + "step": 354 + }, + { + "epoch": 0.10775534982546668, + "grad_norm": 0.38295283913612366, + "learning_rate": 9.82535182747798e-05, + "loss": 1.8938, + "step": 355 + }, + { + "epoch": 0.10805888602215814, + "grad_norm": 0.41975417733192444, + "learning_rate": 9.82484560089096e-05, + "loss": 1.8605, + "step": 356 + }, + { + "epoch": 0.10836242221884959, + "grad_norm": 0.41388946771621704, + "learning_rate": 9.82433937430394e-05, + "loss": 1.812, + "step": 357 + }, + { + "epoch": 0.10866595841554105, + "grad_norm": 0.3470607101917267, + "learning_rate": 9.823833147716919e-05, + "loss": 2.1914, + "step": 358 + }, + { + "epoch": 0.10896949461223251, + "grad_norm": 0.4417155385017395, + "learning_rate": 9.823326921129898e-05, + "loss": 1.7644, + "step": 359 + }, + { + "epoch": 0.10927303080892396, + "grad_norm": 0.33910539746284485, + "learning_rate": 9.822820694542878e-05, + "loss": 1.8821, + "step": 360 + }, + { + "epoch": 0.10957656700561542, + "grad_norm": 0.36742356419563293, + "learning_rate": 9.822314467955857e-05, + "loss": 1.9684, + "step": 361 + }, + { + "epoch": 0.10988010320230687, + "grad_norm": 0.407844603061676, + "learning_rate": 9.821808241368837e-05, + "loss": 1.8797, + "step": 362 + }, + { + "epoch": 0.11018363939899833, + "grad_norm": 0.4090898036956787, + "learning_rate": 9.821302014781816e-05, + "loss": 1.8401, + "step": 363 + }, + { + "epoch": 0.11048717559568978, + "grad_norm": 0.3852720260620117, + "learning_rate": 9.820795788194796e-05, + "loss": 1.6887, + "step": 364 + }, + { + "epoch": 0.11079071179238124, + "grad_norm": 0.4147186875343323, + "learning_rate": 9.820289561607777e-05, + "loss": 1.7263, + "step": 365 + }, + { + "epoch": 0.11109424798907269, + "grad_norm": 0.7032086849212646, + "learning_rate": 9.819783335020756e-05, + "loss": 1.5382, + "step": 366 + }, + { + "epoch": 0.11139778418576415, + "grad_norm": 0.3547534644603729, + "learning_rate": 9.819277108433736e-05, + "loss": 1.5988, + "step": 367 + }, + { + "epoch": 0.1117013203824556, + "grad_norm": 0.45878785848617554, + "learning_rate": 9.818770881846715e-05, + "loss": 2.2467, + "step": 368 + }, + { + "epoch": 0.11200485657914706, + "grad_norm": 0.39183077216148376, + "learning_rate": 9.818264655259695e-05, + "loss": 1.848, + "step": 369 + }, + { + "epoch": 0.11230839277583853, + "grad_norm": 0.3735283315181732, + "learning_rate": 9.817758428672674e-05, + "loss": 1.6925, + "step": 370 + }, + { + "epoch": 0.11261192897252997, + "grad_norm": 0.3878265917301178, + "learning_rate": 9.817252202085654e-05, + "loss": 2.04, + "step": 371 + }, + { + "epoch": 0.11291546516922144, + "grad_norm": 0.38978812098503113, + "learning_rate": 9.816745975498633e-05, + "loss": 1.869, + "step": 372 + }, + { + "epoch": 0.11321900136591288, + "grad_norm": 0.39212337136268616, + "learning_rate": 9.816239748911613e-05, + "loss": 2.0549, + "step": 373 + }, + { + "epoch": 0.11352253756260434, + "grad_norm": 0.39528506994247437, + "learning_rate": 9.815733522324593e-05, + "loss": 1.5653, + "step": 374 + }, + { + "epoch": 0.11382607375929579, + "grad_norm": 0.4226018786430359, + "learning_rate": 9.815227295737573e-05, + "loss": 1.6231, + "step": 375 + }, + { + "epoch": 0.11412960995598725, + "grad_norm": 0.3577810823917389, + "learning_rate": 9.814721069150552e-05, + "loss": 1.9599, + "step": 376 + }, + { + "epoch": 0.1144331461526787, + "grad_norm": 0.33580708503723145, + "learning_rate": 9.814214842563532e-05, + "loss": 2.0419, + "step": 377 + }, + { + "epoch": 0.11473668234937016, + "grad_norm": 0.38860392570495605, + "learning_rate": 9.813708615976511e-05, + "loss": 1.7186, + "step": 378 + }, + { + "epoch": 0.11504021854606161, + "grad_norm": 0.38994479179382324, + "learning_rate": 9.813202389389491e-05, + "loss": 2.1848, + "step": 379 + }, + { + "epoch": 0.11534375474275307, + "grad_norm": 0.3947262763977051, + "learning_rate": 9.81269616280247e-05, + "loss": 2.1868, + "step": 380 + }, + { + "epoch": 0.11564729093944452, + "grad_norm": 0.3112877607345581, + "learning_rate": 9.81218993621545e-05, + "loss": 1.8604, + "step": 381 + }, + { + "epoch": 0.11595082713613598, + "grad_norm": 0.375689834356308, + "learning_rate": 9.811683709628429e-05, + "loss": 2.0418, + "step": 382 + }, + { + "epoch": 0.11625436333282745, + "grad_norm": 0.34537243843078613, + "learning_rate": 9.81117748304141e-05, + "loss": 1.8874, + "step": 383 + }, + { + "epoch": 0.1165578995295189, + "grad_norm": 0.5077370405197144, + "learning_rate": 9.81067125645439e-05, + "loss": 1.7497, + "step": 384 + }, + { + "epoch": 0.11686143572621036, + "grad_norm": 0.3703441023826599, + "learning_rate": 9.810165029867369e-05, + "loss": 1.781, + "step": 385 + }, + { + "epoch": 0.1171649719229018, + "grad_norm": 0.4386610984802246, + "learning_rate": 9.809658803280348e-05, + "loss": 1.8428, + "step": 386 + }, + { + "epoch": 0.11746850811959327, + "grad_norm": 0.37781745195388794, + "learning_rate": 9.809152576693328e-05, + "loss": 2.0384, + "step": 387 + }, + { + "epoch": 0.11777204431628471, + "grad_norm": 0.38956716656684875, + "learning_rate": 9.808646350106307e-05, + "loss": 2.3534, + "step": 388 + }, + { + "epoch": 0.11807558051297617, + "grad_norm": 0.3444838523864746, + "learning_rate": 9.808140123519288e-05, + "loss": 1.921, + "step": 389 + }, + { + "epoch": 0.11837911670966762, + "grad_norm": 0.39881742000579834, + "learning_rate": 9.807633896932268e-05, + "loss": 2.1758, + "step": 390 + }, + { + "epoch": 0.11868265290635908, + "grad_norm": 0.384226530790329, + "learning_rate": 9.807127670345247e-05, + "loss": 1.7651, + "step": 391 + }, + { + "epoch": 0.11898618910305053, + "grad_norm": 0.36255109310150146, + "learning_rate": 9.806621443758227e-05, + "loss": 1.8122, + "step": 392 + }, + { + "epoch": 0.119289725299742, + "grad_norm": 0.3627421259880066, + "learning_rate": 9.806115217171206e-05, + "loss": 1.6304, + "step": 393 + }, + { + "epoch": 0.11959326149643346, + "grad_norm": 0.8936781883239746, + "learning_rate": 9.805608990584187e-05, + "loss": 1.8827, + "step": 394 + }, + { + "epoch": 0.1198967976931249, + "grad_norm": 0.5008642673492432, + "learning_rate": 9.805102763997166e-05, + "loss": 1.3597, + "step": 395 + }, + { + "epoch": 0.12020033388981637, + "grad_norm": 0.4444289207458496, + "learning_rate": 9.804596537410146e-05, + "loss": 2.1768, + "step": 396 + }, + { + "epoch": 0.12050387008650781, + "grad_norm": 0.3963356912136078, + "learning_rate": 9.804090310823125e-05, + "loss": 1.8373, + "step": 397 + }, + { + "epoch": 0.12080740628319928, + "grad_norm": 0.44095271825790405, + "learning_rate": 9.803584084236105e-05, + "loss": 1.7893, + "step": 398 + }, + { + "epoch": 0.12111094247989072, + "grad_norm": 0.4162418246269226, + "learning_rate": 9.803077857649084e-05, + "loss": 1.7482, + "step": 399 + }, + { + "epoch": 0.12141447867658219, + "grad_norm": 0.3853035271167755, + "learning_rate": 9.802571631062064e-05, + "loss": 1.6274, + "step": 400 + }, + { + "epoch": 0.12171801487327363, + "grad_norm": 1.1697463989257812, + "learning_rate": 9.802065404475043e-05, + "loss": 2.2254, + "step": 401 + }, + { + "epoch": 0.1220215510699651, + "grad_norm": 0.3899803161621094, + "learning_rate": 9.801559177888023e-05, + "loss": 1.9754, + "step": 402 + }, + { + "epoch": 0.12232508726665654, + "grad_norm": 0.43946412205696106, + "learning_rate": 9.801052951301002e-05, + "loss": 2.1184, + "step": 403 + }, + { + "epoch": 0.122628623463348, + "grad_norm": 0.46882718801498413, + "learning_rate": 9.800546724713983e-05, + "loss": 1.4423, + "step": 404 + }, + { + "epoch": 0.12293215966003945, + "grad_norm": 0.4379485547542572, + "learning_rate": 9.800040498126963e-05, + "loss": 2.0614, + "step": 405 + }, + { + "epoch": 0.12323569585673091, + "grad_norm": 0.3837740123271942, + "learning_rate": 9.799534271539942e-05, + "loss": 1.9974, + "step": 406 + }, + { + "epoch": 0.12353923205342238, + "grad_norm": 0.35403695702552795, + "learning_rate": 9.799028044952922e-05, + "loss": 1.5693, + "step": 407 + }, + { + "epoch": 0.12384276825011382, + "grad_norm": 0.4070426821708679, + "learning_rate": 9.798521818365901e-05, + "loss": 1.8704, + "step": 408 + }, + { + "epoch": 0.12414630444680529, + "grad_norm": 0.4301077425479889, + "learning_rate": 9.79801559177888e-05, + "loss": 1.077, + "step": 409 + }, + { + "epoch": 0.12444984064349673, + "grad_norm": 0.37687429785728455, + "learning_rate": 9.79750936519186e-05, + "loss": 1.7323, + "step": 410 + }, + { + "epoch": 0.1247533768401882, + "grad_norm": 0.37393873929977417, + "learning_rate": 9.79700313860484e-05, + "loss": 1.9532, + "step": 411 + }, + { + "epoch": 0.12505691303687966, + "grad_norm": 0.4518846869468689, + "learning_rate": 9.796496912017819e-05, + "loss": 2.0123, + "step": 412 + }, + { + "epoch": 0.1253604492335711, + "grad_norm": 0.39417609572410583, + "learning_rate": 9.7959906854308e-05, + "loss": 2.2669, + "step": 413 + }, + { + "epoch": 0.12566398543026255, + "grad_norm": 0.3802976608276367, + "learning_rate": 9.795484458843779e-05, + "loss": 2.0506, + "step": 414 + }, + { + "epoch": 0.12596752162695402, + "grad_norm": 1.3118431568145752, + "learning_rate": 9.794978232256759e-05, + "loss": 2.2551, + "step": 415 + }, + { + "epoch": 0.12627105782364548, + "grad_norm": 0.9459638595581055, + "learning_rate": 9.794472005669738e-05, + "loss": 1.7829, + "step": 416 + }, + { + "epoch": 0.1265745940203369, + "grad_norm": 0.571232795715332, + "learning_rate": 9.793965779082718e-05, + "loss": 1.7768, + "step": 417 + }, + { + "epoch": 0.12687813021702837, + "grad_norm": 0.3973385989665985, + "learning_rate": 9.793459552495697e-05, + "loss": 1.88, + "step": 418 + }, + { + "epoch": 0.12718166641371983, + "grad_norm": 0.3883122503757477, + "learning_rate": 9.792953325908677e-05, + "loss": 1.9592, + "step": 419 + }, + { + "epoch": 0.1274852026104113, + "grad_norm": 0.40379586815834045, + "learning_rate": 9.792447099321656e-05, + "loss": 1.9697, + "step": 420 + }, + { + "epoch": 0.12778873880710276, + "grad_norm": 0.3288556635379791, + "learning_rate": 9.791940872734636e-05, + "loss": 1.7282, + "step": 421 + }, + { + "epoch": 0.1280922750037942, + "grad_norm": 0.3872746527194977, + "learning_rate": 9.791434646147616e-05, + "loss": 1.9348, + "step": 422 + }, + { + "epoch": 0.12839581120048565, + "grad_norm": 0.37058207392692566, + "learning_rate": 9.790928419560596e-05, + "loss": 1.5684, + "step": 423 + }, + { + "epoch": 0.12869934739717712, + "grad_norm": 0.37466561794281006, + "learning_rate": 9.790422192973575e-05, + "loss": 1.9535, + "step": 424 + }, + { + "epoch": 0.12900288359386858, + "grad_norm": 0.32176846265792847, + "learning_rate": 9.789915966386555e-05, + "loss": 1.8537, + "step": 425 + }, + { + "epoch": 0.12930641979056, + "grad_norm": 0.37653467059135437, + "learning_rate": 9.789409739799534e-05, + "loss": 2.0701, + "step": 426 + }, + { + "epoch": 0.12960995598725147, + "grad_norm": 0.38768434524536133, + "learning_rate": 9.788903513212514e-05, + "loss": 1.731, + "step": 427 + }, + { + "epoch": 0.12991349218394294, + "grad_norm": 0.5139635801315308, + "learning_rate": 9.788397286625493e-05, + "loss": 2.4437, + "step": 428 + }, + { + "epoch": 0.1302170283806344, + "grad_norm": 0.3759630024433136, + "learning_rate": 9.787891060038473e-05, + "loss": 2.0918, + "step": 429 + }, + { + "epoch": 0.13052056457732586, + "grad_norm": 0.3718818426132202, + "learning_rate": 9.787384833451452e-05, + "loss": 1.5854, + "step": 430 + }, + { + "epoch": 0.1308241007740173, + "grad_norm": 0.6460405588150024, + "learning_rate": 9.786878606864432e-05, + "loss": 2.2442, + "step": 431 + }, + { + "epoch": 0.13112763697070876, + "grad_norm": 0.40393388271331787, + "learning_rate": 9.786372380277413e-05, + "loss": 1.728, + "step": 432 + }, + { + "epoch": 0.13143117316740022, + "grad_norm": 0.3772658407688141, + "learning_rate": 9.785866153690393e-05, + "loss": 1.668, + "step": 433 + }, + { + "epoch": 0.13173470936409168, + "grad_norm": 2.5252649784088135, + "learning_rate": 9.785359927103373e-05, + "loss": 1.8864, + "step": 434 + }, + { + "epoch": 0.1320382455607831, + "grad_norm": 0.42327219247817993, + "learning_rate": 9.784853700516352e-05, + "loss": 2.3174, + "step": 435 + }, + { + "epoch": 0.13234178175747457, + "grad_norm": 0.3689473867416382, + "learning_rate": 9.784347473929332e-05, + "loss": 1.9671, + "step": 436 + }, + { + "epoch": 0.13264531795416604, + "grad_norm": 0.37554243206977844, + "learning_rate": 9.783841247342311e-05, + "loss": 1.783, + "step": 437 + }, + { + "epoch": 0.1329488541508575, + "grad_norm": 0.409587025642395, + "learning_rate": 9.783335020755291e-05, + "loss": 2.0385, + "step": 438 + }, + { + "epoch": 0.13325239034754893, + "grad_norm": 0.349252849817276, + "learning_rate": 9.78282879416827e-05, + "loss": 1.8785, + "step": 439 + }, + { + "epoch": 0.1335559265442404, + "grad_norm": 0.36687588691711426, + "learning_rate": 9.78232256758125e-05, + "loss": 2.1174, + "step": 440 + }, + { + "epoch": 0.13385946274093186, + "grad_norm": 0.40221846103668213, + "learning_rate": 9.781816340994229e-05, + "loss": 1.8385, + "step": 441 + }, + { + "epoch": 0.13416299893762332, + "grad_norm": 0.5634617805480957, + "learning_rate": 9.781310114407209e-05, + "loss": 1.9316, + "step": 442 + }, + { + "epoch": 0.13446653513431478, + "grad_norm": 0.37704020738601685, + "learning_rate": 9.78080388782019e-05, + "loss": 1.8865, + "step": 443 + }, + { + "epoch": 0.1347700713310062, + "grad_norm": 0.36043843626976013, + "learning_rate": 9.780297661233169e-05, + "loss": 1.585, + "step": 444 + }, + { + "epoch": 0.13507360752769768, + "grad_norm": 0.33643844723701477, + "learning_rate": 9.779791434646149e-05, + "loss": 1.8098, + "step": 445 + }, + { + "epoch": 0.13537714372438914, + "grad_norm": 0.6782101988792419, + "learning_rate": 9.779285208059128e-05, + "loss": 2.0468, + "step": 446 + }, + { + "epoch": 0.1356806799210806, + "grad_norm": 0.38101980090141296, + "learning_rate": 9.778778981472108e-05, + "loss": 2.0624, + "step": 447 + }, + { + "epoch": 0.13598421611777203, + "grad_norm": 0.399311900138855, + "learning_rate": 9.778272754885087e-05, + "loss": 2.1652, + "step": 448 + }, + { + "epoch": 0.1362877523144635, + "grad_norm": 0.3491426706314087, + "learning_rate": 9.777766528298066e-05, + "loss": 1.9092, + "step": 449 + }, + { + "epoch": 0.13659128851115496, + "grad_norm": 0.3654717803001404, + "learning_rate": 9.777260301711046e-05, + "loss": 1.9773, + "step": 450 + }, + { + "epoch": 0.13689482470784642, + "grad_norm": 0.394699364900589, + "learning_rate": 9.776754075124025e-05, + "loss": 2.1568, + "step": 451 + }, + { + "epoch": 0.13719836090453785, + "grad_norm": 0.3601212203502655, + "learning_rate": 9.776247848537006e-05, + "loss": 1.8744, + "step": 452 + }, + { + "epoch": 0.13750189710122931, + "grad_norm": 0.40716952085494995, + "learning_rate": 9.775741621949986e-05, + "loss": 2.1052, + "step": 453 + }, + { + "epoch": 0.13780543329792078, + "grad_norm": 0.37777504324913025, + "learning_rate": 9.775235395362965e-05, + "loss": 1.8896, + "step": 454 + }, + { + "epoch": 0.13810896949461224, + "grad_norm": 0.368600994348526, + "learning_rate": 9.774729168775945e-05, + "loss": 1.8285, + "step": 455 + }, + { + "epoch": 0.1384125056913037, + "grad_norm": 0.41742029786109924, + "learning_rate": 9.774222942188924e-05, + "loss": 1.8286, + "step": 456 + }, + { + "epoch": 0.13871604188799513, + "grad_norm": 0.40132156014442444, + "learning_rate": 9.773716715601904e-05, + "loss": 1.9515, + "step": 457 + }, + { + "epoch": 0.1390195780846866, + "grad_norm": 0.44473376870155334, + "learning_rate": 9.773210489014883e-05, + "loss": 1.8715, + "step": 458 + }, + { + "epoch": 0.13932311428137806, + "grad_norm": 0.40146371722221375, + "learning_rate": 9.772704262427863e-05, + "loss": 2.1469, + "step": 459 + }, + { + "epoch": 0.13962665047806952, + "grad_norm": 0.3863317370414734, + "learning_rate": 9.772198035840842e-05, + "loss": 1.9215, + "step": 460 + }, + { + "epoch": 0.13993018667476095, + "grad_norm": 0.40235334634780884, + "learning_rate": 9.771691809253823e-05, + "loss": 2.1276, + "step": 461 + }, + { + "epoch": 0.14023372287145242, + "grad_norm": 0.46011632680892944, + "learning_rate": 9.771185582666802e-05, + "loss": 1.244, + "step": 462 + }, + { + "epoch": 0.14053725906814388, + "grad_norm": 0.3428272008895874, + "learning_rate": 9.770679356079782e-05, + "loss": 1.7991, + "step": 463 + }, + { + "epoch": 0.14084079526483534, + "grad_norm": 0.39976757764816284, + "learning_rate": 9.770173129492761e-05, + "loss": 1.7166, + "step": 464 + }, + { + "epoch": 0.1411443314615268, + "grad_norm": 0.3258446753025055, + "learning_rate": 9.769666902905741e-05, + "loss": 1.677, + "step": 465 + }, + { + "epoch": 0.14144786765821823, + "grad_norm": 0.3950905501842499, + "learning_rate": 9.76916067631872e-05, + "loss": 2.0122, + "step": 466 + }, + { + "epoch": 0.1417514038549097, + "grad_norm": 0.39712047576904297, + "learning_rate": 9.7686544497317e-05, + "loss": 1.7262, + "step": 467 + }, + { + "epoch": 0.14205494005160116, + "grad_norm": 0.8331599235534668, + "learning_rate": 9.768148223144679e-05, + "loss": 1.9852, + "step": 468 + }, + { + "epoch": 0.14235847624829262, + "grad_norm": 0.3578427731990814, + "learning_rate": 9.767641996557659e-05, + "loss": 1.8249, + "step": 469 + }, + { + "epoch": 0.14266201244498405, + "grad_norm": 0.3736058473587036, + "learning_rate": 9.767135769970638e-05, + "loss": 1.43, + "step": 470 + }, + { + "epoch": 0.14296554864167552, + "grad_norm": 0.48153185844421387, + "learning_rate": 9.766629543383619e-05, + "loss": 1.8667, + "step": 471 + }, + { + "epoch": 0.14326908483836698, + "grad_norm": 0.3924524188041687, + "learning_rate": 9.766123316796599e-05, + "loss": 2.0385, + "step": 472 + }, + { + "epoch": 0.14357262103505844, + "grad_norm": 0.38956940174102783, + "learning_rate": 9.765617090209578e-05, + "loss": 1.3157, + "step": 473 + }, + { + "epoch": 0.14387615723174987, + "grad_norm": 0.4032903015613556, + "learning_rate": 9.765110863622558e-05, + "loss": 1.8793, + "step": 474 + }, + { + "epoch": 0.14417969342844134, + "grad_norm": 0.5116568207740784, + "learning_rate": 9.764604637035537e-05, + "loss": 1.7658, + "step": 475 + }, + { + "epoch": 0.1444832296251328, + "grad_norm": 0.3981756269931793, + "learning_rate": 9.764098410448517e-05, + "loss": 1.8087, + "step": 476 + }, + { + "epoch": 0.14478676582182426, + "grad_norm": 0.43181854486465454, + "learning_rate": 9.763592183861496e-05, + "loss": 1.5241, + "step": 477 + }, + { + "epoch": 0.14509030201851572, + "grad_norm": 0.4172961413860321, + "learning_rate": 9.763085957274477e-05, + "loss": 1.8318, + "step": 478 + }, + { + "epoch": 0.14539383821520716, + "grad_norm": 0.4135033190250397, + "learning_rate": 9.762579730687456e-05, + "loss": 2.0783, + "step": 479 + }, + { + "epoch": 0.14569737441189862, + "grad_norm": 0.36482739448547363, + "learning_rate": 9.762073504100436e-05, + "loss": 2.2524, + "step": 480 + }, + { + "epoch": 0.14600091060859008, + "grad_norm": 0.3704656958580017, + "learning_rate": 9.761567277513415e-05, + "loss": 2.0369, + "step": 481 + }, + { + "epoch": 0.14630444680528154, + "grad_norm": 1.588393211364746, + "learning_rate": 9.761061050926396e-05, + "loss": 1.8041, + "step": 482 + }, + { + "epoch": 0.14660798300197297, + "grad_norm": 0.3309743404388428, + "learning_rate": 9.760554824339376e-05, + "loss": 1.8373, + "step": 483 + }, + { + "epoch": 0.14691151919866444, + "grad_norm": 0.34598830342292786, + "learning_rate": 9.760048597752355e-05, + "loss": 1.6249, + "step": 484 + }, + { + "epoch": 0.1472150553953559, + "grad_norm": 0.3433639109134674, + "learning_rate": 9.759542371165335e-05, + "loss": 1.9454, + "step": 485 + }, + { + "epoch": 0.14751859159204736, + "grad_norm": 0.3801734149456024, + "learning_rate": 9.759036144578314e-05, + "loss": 2.1067, + "step": 486 + }, + { + "epoch": 0.1478221277887388, + "grad_norm": 0.36811041831970215, + "learning_rate": 9.758529917991293e-05, + "loss": 1.8642, + "step": 487 + }, + { + "epoch": 0.14812566398543026, + "grad_norm": 0.3999156355857849, + "learning_rate": 9.758023691404273e-05, + "loss": 2.1482, + "step": 488 + }, + { + "epoch": 0.14842920018212172, + "grad_norm": 0.7651489973068237, + "learning_rate": 9.757517464817252e-05, + "loss": 1.8213, + "step": 489 + }, + { + "epoch": 0.14873273637881318, + "grad_norm": 0.3491712808609009, + "learning_rate": 9.757011238230232e-05, + "loss": 2.1047, + "step": 490 + }, + { + "epoch": 0.14903627257550464, + "grad_norm": 1.028256893157959, + "learning_rate": 9.756505011643213e-05, + "loss": 2.0519, + "step": 491 + }, + { + "epoch": 0.14933980877219608, + "grad_norm": 0.5957101583480835, + "learning_rate": 9.755998785056192e-05, + "loss": 2.1236, + "step": 492 + }, + { + "epoch": 0.14964334496888754, + "grad_norm": 0.40934717655181885, + "learning_rate": 9.755492558469172e-05, + "loss": 1.5391, + "step": 493 + }, + { + "epoch": 0.149946881165579, + "grad_norm": 0.4403507709503174, + "learning_rate": 9.754986331882151e-05, + "loss": 1.8388, + "step": 494 + }, + { + "epoch": 0.15025041736227046, + "grad_norm": 0.4258563220500946, + "learning_rate": 9.754480105295131e-05, + "loss": 1.8092, + "step": 495 + }, + { + "epoch": 0.1505539535589619, + "grad_norm": 0.3594823181629181, + "learning_rate": 9.75397387870811e-05, + "loss": 1.7195, + "step": 496 + }, + { + "epoch": 0.15085748975565336, + "grad_norm": 0.30373120307922363, + "learning_rate": 9.75346765212109e-05, + "loss": 1.9267, + "step": 497 + }, + { + "epoch": 0.15116102595234482, + "grad_norm": 0.423096626996994, + "learning_rate": 9.752961425534069e-05, + "loss": 2.1559, + "step": 498 + }, + { + "epoch": 0.15146456214903628, + "grad_norm": 0.36935552954673767, + "learning_rate": 9.752455198947049e-05, + "loss": 2.0357, + "step": 499 + }, + { + "epoch": 0.15176809834572771, + "grad_norm": 0.7172725200653076, + "learning_rate": 9.75194897236003e-05, + "loss": 2.0973, + "step": 500 + }, + { + "epoch": 0.15207163454241918, + "grad_norm": 0.36897605657577515, + "learning_rate": 9.751442745773009e-05, + "loss": 2.1672, + "step": 501 + }, + { + "epoch": 0.15237517073911064, + "grad_norm": 0.35079488158226013, + "learning_rate": 9.750936519185988e-05, + "loss": 2.0808, + "step": 502 + }, + { + "epoch": 0.1526787069358021, + "grad_norm": 0.37833186984062195, + "learning_rate": 9.750430292598968e-05, + "loss": 1.8393, + "step": 503 + }, + { + "epoch": 0.15298224313249356, + "grad_norm": 0.3969264328479767, + "learning_rate": 9.749924066011947e-05, + "loss": 2.1213, + "step": 504 + }, + { + "epoch": 0.153285779329185, + "grad_norm": 0.30432841181755066, + "learning_rate": 9.749417839424927e-05, + "loss": 1.6397, + "step": 505 + }, + { + "epoch": 0.15358931552587646, + "grad_norm": 0.30847886204719543, + "learning_rate": 9.748911612837906e-05, + "loss": 1.6455, + "step": 506 + }, + { + "epoch": 0.15389285172256792, + "grad_norm": 0.38480496406555176, + "learning_rate": 9.748405386250886e-05, + "loss": 1.803, + "step": 507 + }, + { + "epoch": 0.15419638791925938, + "grad_norm": 0.48439183831214905, + "learning_rate": 9.747899159663865e-05, + "loss": 1.6892, + "step": 508 + }, + { + "epoch": 0.15449992411595082, + "grad_norm": 0.5124354362487793, + "learning_rate": 9.747392933076845e-05, + "loss": 2.24, + "step": 509 + }, + { + "epoch": 0.15480346031264228, + "grad_norm": 0.4051717221736908, + "learning_rate": 9.746886706489826e-05, + "loss": 1.8621, + "step": 510 + }, + { + "epoch": 0.15510699650933374, + "grad_norm": 0.6452261209487915, + "learning_rate": 9.746380479902805e-05, + "loss": 1.7043, + "step": 511 + }, + { + "epoch": 0.1554105327060252, + "grad_norm": 0.5453522801399231, + "learning_rate": 9.745874253315785e-05, + "loss": 1.7325, + "step": 512 + }, + { + "epoch": 0.15571406890271666, + "grad_norm": 1.0983595848083496, + "learning_rate": 9.745368026728764e-05, + "loss": 2.169, + "step": 513 + }, + { + "epoch": 0.1560176050994081, + "grad_norm": 0.3821035623550415, + "learning_rate": 9.744861800141744e-05, + "loss": 2.3305, + "step": 514 + }, + { + "epoch": 0.15632114129609956, + "grad_norm": 0.3694508969783783, + "learning_rate": 9.744355573554723e-05, + "loss": 1.8453, + "step": 515 + }, + { + "epoch": 0.15662467749279102, + "grad_norm": 0.3837510943412781, + "learning_rate": 9.743849346967702e-05, + "loss": 1.9679, + "step": 516 + }, + { + "epoch": 0.15692821368948248, + "grad_norm": 0.41427966952323914, + "learning_rate": 9.743343120380682e-05, + "loss": 1.9331, + "step": 517 + }, + { + "epoch": 0.15723174988617392, + "grad_norm": 0.34252259135246277, + "learning_rate": 9.742836893793661e-05, + "loss": 1.7938, + "step": 518 + }, + { + "epoch": 0.15753528608286538, + "grad_norm": 0.4043283462524414, + "learning_rate": 9.742330667206642e-05, + "loss": 1.4037, + "step": 519 + }, + { + "epoch": 0.15783882227955684, + "grad_norm": 0.4225389361381531, + "learning_rate": 9.741824440619622e-05, + "loss": 1.6224, + "step": 520 + }, + { + "epoch": 0.1581423584762483, + "grad_norm": 0.377590537071228, + "learning_rate": 9.741318214032601e-05, + "loss": 2.0567, + "step": 521 + }, + { + "epoch": 0.15844589467293974, + "grad_norm": 0.46170124411582947, + "learning_rate": 9.740811987445582e-05, + "loss": 2.0449, + "step": 522 + }, + { + "epoch": 0.1587494308696312, + "grad_norm": 0.3752427399158478, + "learning_rate": 9.740305760858562e-05, + "loss": 1.8207, + "step": 523 + }, + { + "epoch": 0.15905296706632266, + "grad_norm": 0.390803724527359, + "learning_rate": 9.739799534271541e-05, + "loss": 2.0781, + "step": 524 + }, + { + "epoch": 0.15935650326301412, + "grad_norm": 0.38587453961372375, + "learning_rate": 9.73929330768452e-05, + "loss": 1.9932, + "step": 525 + }, + { + "epoch": 0.15966003945970558, + "grad_norm": 0.4154350459575653, + "learning_rate": 9.7387870810975e-05, + "loss": 1.7649, + "step": 526 + }, + { + "epoch": 0.15996357565639702, + "grad_norm": 0.3698589503765106, + "learning_rate": 9.73828085451048e-05, + "loss": 1.6921, + "step": 527 + }, + { + "epoch": 0.16026711185308848, + "grad_norm": 0.4110312759876251, + "learning_rate": 9.737774627923459e-05, + "loss": 1.1834, + "step": 528 + }, + { + "epoch": 0.16057064804977994, + "grad_norm": 0.4140758812427521, + "learning_rate": 9.737268401336438e-05, + "loss": 1.8354, + "step": 529 + }, + { + "epoch": 0.1608741842464714, + "grad_norm": 0.38738423585891724, + "learning_rate": 9.736762174749419e-05, + "loss": 1.9223, + "step": 530 + }, + { + "epoch": 0.16117772044316284, + "grad_norm": 0.4055260717868805, + "learning_rate": 9.736255948162399e-05, + "loss": 1.7802, + "step": 531 + }, + { + "epoch": 0.1614812566398543, + "grad_norm": 0.44946524500846863, + "learning_rate": 9.735749721575378e-05, + "loss": 1.8654, + "step": 532 + }, + { + "epoch": 0.16178479283654576, + "grad_norm": 0.43206432461738586, + "learning_rate": 9.735243494988358e-05, + "loss": 1.7607, + "step": 533 + }, + { + "epoch": 0.16208832903323722, + "grad_norm": 0.5007991194725037, + "learning_rate": 9.734737268401337e-05, + "loss": 1.9378, + "step": 534 + }, + { + "epoch": 0.16239186522992866, + "grad_norm": 0.48757919669151306, + "learning_rate": 9.734231041814317e-05, + "loss": 2.1829, + "step": 535 + }, + { + "epoch": 0.16269540142662012, + "grad_norm": 0.4159701466560364, + "learning_rate": 9.733724815227296e-05, + "loss": 1.8847, + "step": 536 + }, + { + "epoch": 0.16299893762331158, + "grad_norm": 0.40922749042510986, + "learning_rate": 9.733218588640276e-05, + "loss": 1.4376, + "step": 537 + }, + { + "epoch": 0.16330247382000304, + "grad_norm": 0.33677083253860474, + "learning_rate": 9.732712362053255e-05, + "loss": 1.9568, + "step": 538 + }, + { + "epoch": 0.1636060100166945, + "grad_norm": 0.3255022168159485, + "learning_rate": 9.732206135466236e-05, + "loss": 1.9949, + "step": 539 + }, + { + "epoch": 0.16390954621338594, + "grad_norm": 0.3848338723182678, + "learning_rate": 9.731699908879215e-05, + "loss": 2.042, + "step": 540 + }, + { + "epoch": 0.1642130824100774, + "grad_norm": 0.3888263404369354, + "learning_rate": 9.731193682292195e-05, + "loss": 1.885, + "step": 541 + }, + { + "epoch": 0.16451661860676886, + "grad_norm": 0.40090805292129517, + "learning_rate": 9.730687455705174e-05, + "loss": 1.9093, + "step": 542 + }, + { + "epoch": 0.16482015480346032, + "grad_norm": 0.4106220602989197, + "learning_rate": 9.730181229118154e-05, + "loss": 1.8392, + "step": 543 + }, + { + "epoch": 0.16512369100015176, + "grad_norm": 0.3483395278453827, + "learning_rate": 9.729675002531133e-05, + "loss": 2.0235, + "step": 544 + }, + { + "epoch": 0.16542722719684322, + "grad_norm": 0.3686208128929138, + "learning_rate": 9.729168775944113e-05, + "loss": 1.9218, + "step": 545 + }, + { + "epoch": 0.16573076339353468, + "grad_norm": 0.36063849925994873, + "learning_rate": 9.728662549357092e-05, + "loss": 1.9334, + "step": 546 + }, + { + "epoch": 0.16603429959022614, + "grad_norm": 0.39365142583847046, + "learning_rate": 9.728156322770072e-05, + "loss": 1.9825, + "step": 547 + }, + { + "epoch": 0.16633783578691758, + "grad_norm": 0.4062787592411041, + "learning_rate": 9.727650096183051e-05, + "loss": 1.521, + "step": 548 + }, + { + "epoch": 0.16664137198360904, + "grad_norm": 0.37347134947776794, + "learning_rate": 9.727143869596032e-05, + "loss": 1.9356, + "step": 549 + }, + { + "epoch": 0.1669449081803005, + "grad_norm": 0.3538997173309326, + "learning_rate": 9.726637643009012e-05, + "loss": 1.845, + "step": 550 + }, + { + "epoch": 0.16724844437699196, + "grad_norm": 0.3868335783481598, + "learning_rate": 9.726131416421991e-05, + "loss": 1.9803, + "step": 551 + }, + { + "epoch": 0.16755198057368342, + "grad_norm": 0.34705451130867004, + "learning_rate": 9.72562518983497e-05, + "loss": 2.0866, + "step": 552 + }, + { + "epoch": 0.16785551677037486, + "grad_norm": 0.3794872462749481, + "learning_rate": 9.72511896324795e-05, + "loss": 2.094, + "step": 553 + }, + { + "epoch": 0.16815905296706632, + "grad_norm": 0.5801231861114502, + "learning_rate": 9.72461273666093e-05, + "loss": 1.7851, + "step": 554 + }, + { + "epoch": 0.16846258916375778, + "grad_norm": 0.3076344132423401, + "learning_rate": 9.724106510073909e-05, + "loss": 1.5188, + "step": 555 + }, + { + "epoch": 0.16876612536044924, + "grad_norm": 0.3552989363670349, + "learning_rate": 9.723600283486888e-05, + "loss": 2.1063, + "step": 556 + }, + { + "epoch": 0.16906966155714068, + "grad_norm": 0.36939847469329834, + "learning_rate": 9.723094056899868e-05, + "loss": 1.7648, + "step": 557 + }, + { + "epoch": 0.16937319775383214, + "grad_norm": 0.358634889125824, + "learning_rate": 9.722587830312849e-05, + "loss": 1.8007, + "step": 558 + }, + { + "epoch": 0.1696767339505236, + "grad_norm": 0.39962029457092285, + "learning_rate": 9.722081603725828e-05, + "loss": 1.8845, + "step": 559 + }, + { + "epoch": 0.16998027014721506, + "grad_norm": 0.4099076986312866, + "learning_rate": 9.721575377138808e-05, + "loss": 1.8894, + "step": 560 + }, + { + "epoch": 0.17028380634390652, + "grad_norm": 0.3610551655292511, + "learning_rate": 9.721069150551787e-05, + "loss": 1.8089, + "step": 561 + }, + { + "epoch": 0.17058734254059796, + "grad_norm": 0.5951200723648071, + "learning_rate": 9.720562923964767e-05, + "loss": 1.6966, + "step": 562 + }, + { + "epoch": 0.17089087873728942, + "grad_norm": 0.562522292137146, + "learning_rate": 9.720056697377746e-05, + "loss": 1.7704, + "step": 563 + }, + { + "epoch": 0.17119441493398088, + "grad_norm": 0.6662526726722717, + "learning_rate": 9.719550470790726e-05, + "loss": 1.7714, + "step": 564 + }, + { + "epoch": 0.17149795113067234, + "grad_norm": 0.44034865498542786, + "learning_rate": 9.719044244203705e-05, + "loss": 2.1042, + "step": 565 + }, + { + "epoch": 0.17180148732736378, + "grad_norm": 0.39868202805519104, + "learning_rate": 9.718538017616685e-05, + "loss": 1.952, + "step": 566 + }, + { + "epoch": 0.17210502352405524, + "grad_norm": 0.3427380621433258, + "learning_rate": 9.718031791029665e-05, + "loss": 2.037, + "step": 567 + }, + { + "epoch": 0.1724085597207467, + "grad_norm": 0.37980929017066956, + "learning_rate": 9.717525564442645e-05, + "loss": 1.5378, + "step": 568 + }, + { + "epoch": 0.17271209591743816, + "grad_norm": 0.32314518094062805, + "learning_rate": 9.717019337855626e-05, + "loss": 1.6191, + "step": 569 + }, + { + "epoch": 0.1730156321141296, + "grad_norm": 0.40600740909576416, + "learning_rate": 9.716513111268605e-05, + "loss": 1.6055, + "step": 570 + }, + { + "epoch": 0.17331916831082106, + "grad_norm": 0.37318041920661926, + "learning_rate": 9.716006884681585e-05, + "loss": 1.8666, + "step": 571 + }, + { + "epoch": 0.17362270450751252, + "grad_norm": 0.3656068444252014, + "learning_rate": 9.715500658094564e-05, + "loss": 1.5983, + "step": 572 + }, + { + "epoch": 0.17392624070420398, + "grad_norm": 0.3546827733516693, + "learning_rate": 9.714994431507544e-05, + "loss": 2.2088, + "step": 573 + }, + { + "epoch": 0.17422977690089544, + "grad_norm": 0.4293152689933777, + "learning_rate": 9.714488204920523e-05, + "loss": 1.803, + "step": 574 + }, + { + "epoch": 0.17453331309758688, + "grad_norm": 0.3790314495563507, + "learning_rate": 9.713981978333503e-05, + "loss": 1.9874, + "step": 575 + }, + { + "epoch": 0.17483684929427834, + "grad_norm": 0.37619829177856445, + "learning_rate": 9.713475751746482e-05, + "loss": 1.9061, + "step": 576 + }, + { + "epoch": 0.1751403854909698, + "grad_norm": 0.36988991498947144, + "learning_rate": 9.712969525159462e-05, + "loss": 1.5463, + "step": 577 + }, + { + "epoch": 0.17544392168766126, + "grad_norm": 0.367721825838089, + "learning_rate": 9.712463298572442e-05, + "loss": 1.6526, + "step": 578 + }, + { + "epoch": 0.1757474578843527, + "grad_norm": 0.39620110392570496, + "learning_rate": 9.711957071985422e-05, + "loss": 2.056, + "step": 579 + }, + { + "epoch": 0.17605099408104416, + "grad_norm": 0.41518276929855347, + "learning_rate": 9.711450845398401e-05, + "loss": 1.6847, + "step": 580 + }, + { + "epoch": 0.17635453027773562, + "grad_norm": 0.3925170302391052, + "learning_rate": 9.710944618811381e-05, + "loss": 1.8476, + "step": 581 + }, + { + "epoch": 0.17665806647442708, + "grad_norm": 0.36658090353012085, + "learning_rate": 9.71043839222436e-05, + "loss": 2.0699, + "step": 582 + }, + { + "epoch": 0.17696160267111852, + "grad_norm": 0.3741433620452881, + "learning_rate": 9.70993216563734e-05, + "loss": 1.9645, + "step": 583 + }, + { + "epoch": 0.17726513886780998, + "grad_norm": 0.3742316663265228, + "learning_rate": 9.709425939050319e-05, + "loss": 2.3717, + "step": 584 + }, + { + "epoch": 0.17756867506450144, + "grad_norm": 0.3796440660953522, + "learning_rate": 9.708919712463299e-05, + "loss": 1.9356, + "step": 585 + }, + { + "epoch": 0.1778722112611929, + "grad_norm": 0.3976511061191559, + "learning_rate": 9.708413485876278e-05, + "loss": 2.1889, + "step": 586 + }, + { + "epoch": 0.17817574745788436, + "grad_norm": 0.34445542097091675, + "learning_rate": 9.707907259289258e-05, + "loss": 1.6535, + "step": 587 + }, + { + "epoch": 0.1784792836545758, + "grad_norm": 0.3982098698616028, + "learning_rate": 9.707401032702239e-05, + "loss": 2.0542, + "step": 588 + }, + { + "epoch": 0.17878281985126726, + "grad_norm": 0.42155295610427856, + "learning_rate": 9.706894806115218e-05, + "loss": 1.4605, + "step": 589 + }, + { + "epoch": 0.17908635604795872, + "grad_norm": 0.36341744661331177, + "learning_rate": 9.706388579528197e-05, + "loss": 1.8069, + "step": 590 + }, + { + "epoch": 0.17938989224465018, + "grad_norm": 0.3715178668498993, + "learning_rate": 9.705882352941177e-05, + "loss": 1.5512, + "step": 591 + }, + { + "epoch": 0.17969342844134162, + "grad_norm": 0.376767635345459, + "learning_rate": 9.705376126354156e-05, + "loss": 1.6027, + "step": 592 + }, + { + "epoch": 0.17999696463803308, + "grad_norm": 0.4033347964286804, + "learning_rate": 9.704869899767136e-05, + "loss": 1.5071, + "step": 593 + }, + { + "epoch": 0.18030050083472454, + "grad_norm": 0.8200478553771973, + "learning_rate": 9.704363673180115e-05, + "loss": 1.924, + "step": 594 + }, + { + "epoch": 0.180604037031416, + "grad_norm": 0.6224507093429565, + "learning_rate": 9.703857446593095e-05, + "loss": 1.9684, + "step": 595 + }, + { + "epoch": 0.18090757322810747, + "grad_norm": 0.32032859325408936, + "learning_rate": 9.703351220006074e-05, + "loss": 1.9478, + "step": 596 + }, + { + "epoch": 0.1812111094247989, + "grad_norm": 0.33331337571144104, + "learning_rate": 9.702844993419055e-05, + "loss": 1.8177, + "step": 597 + }, + { + "epoch": 0.18151464562149036, + "grad_norm": 0.47399207949638367, + "learning_rate": 9.702338766832035e-05, + "loss": 2.07, + "step": 598 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 0.30480411648750305, + "learning_rate": 9.701832540245014e-05, + "loss": 2.0407, + "step": 599 + }, + { + "epoch": 0.18212171801487329, + "grad_norm": 0.40148988366127014, + "learning_rate": 9.701326313657994e-05, + "loss": 1.8774, + "step": 600 + }, + { + "epoch": 0.18242525421156472, + "grad_norm": 0.3958423137664795, + "learning_rate": 9.700820087070973e-05, + "loss": 1.8462, + "step": 601 + }, + { + "epoch": 0.18272879040825618, + "grad_norm": 0.34824639558792114, + "learning_rate": 9.700313860483953e-05, + "loss": 1.7839, + "step": 602 + }, + { + "epoch": 0.18303232660494764, + "grad_norm": 0.38002872467041016, + "learning_rate": 9.699807633896932e-05, + "loss": 2.3237, + "step": 603 + }, + { + "epoch": 0.1833358628016391, + "grad_norm": 0.37800419330596924, + "learning_rate": 9.699301407309912e-05, + "loss": 1.9375, + "step": 604 + }, + { + "epoch": 0.18363939899833054, + "grad_norm": 0.4041115939617157, + "learning_rate": 9.698795180722891e-05, + "loss": 2.029, + "step": 605 + }, + { + "epoch": 0.183942935195022, + "grad_norm": 0.3697315454483032, + "learning_rate": 9.698288954135872e-05, + "loss": 1.894, + "step": 606 + }, + { + "epoch": 0.18424647139171346, + "grad_norm": 0.3809906542301178, + "learning_rate": 9.697782727548851e-05, + "loss": 1.8242, + "step": 607 + }, + { + "epoch": 0.18455000758840492, + "grad_norm": 0.3997717499732971, + "learning_rate": 9.697276500961831e-05, + "loss": 2.0522, + "step": 608 + }, + { + "epoch": 0.18485354378509639, + "grad_norm": 0.391699880361557, + "learning_rate": 9.69677027437481e-05, + "loss": 1.8521, + "step": 609 + }, + { + "epoch": 0.18515707998178782, + "grad_norm": 0.3667858839035034, + "learning_rate": 9.69626404778779e-05, + "loss": 1.7613, + "step": 610 + }, + { + "epoch": 0.18546061617847928, + "grad_norm": 0.3905411958694458, + "learning_rate": 9.69575782120077e-05, + "loss": 1.8285, + "step": 611 + }, + { + "epoch": 0.18576415237517074, + "grad_norm": 0.4121951758861542, + "learning_rate": 9.69525159461375e-05, + "loss": 1.8104, + "step": 612 + }, + { + "epoch": 0.1860676885718622, + "grad_norm": 0.34977591037750244, + "learning_rate": 9.69474536802673e-05, + "loss": 1.7737, + "step": 613 + }, + { + "epoch": 0.18637122476855364, + "grad_norm": 0.34084367752075195, + "learning_rate": 9.694239141439709e-05, + "loss": 2.0407, + "step": 614 + }, + { + "epoch": 0.1866747609652451, + "grad_norm": 0.35442525148391724, + "learning_rate": 9.693732914852689e-05, + "loss": 1.9152, + "step": 615 + }, + { + "epoch": 0.18697829716193656, + "grad_norm": 0.34404149651527405, + "learning_rate": 9.693226688265668e-05, + "loss": 1.7621, + "step": 616 + }, + { + "epoch": 0.18728183335862802, + "grad_norm": 0.4516477882862091, + "learning_rate": 9.692720461678649e-05, + "loss": 1.7624, + "step": 617 + }, + { + "epoch": 0.18758536955531946, + "grad_norm": 0.3506614565849304, + "learning_rate": 9.692214235091628e-05, + "loss": 1.6627, + "step": 618 + }, + { + "epoch": 0.18788890575201092, + "grad_norm": 0.9165719151496887, + "learning_rate": 9.691708008504608e-05, + "loss": 2.1926, + "step": 619 + }, + { + "epoch": 0.18819244194870238, + "grad_norm": 0.3361871838569641, + "learning_rate": 9.691201781917587e-05, + "loss": 1.5229, + "step": 620 + }, + { + "epoch": 0.18849597814539384, + "grad_norm": 0.32639381289482117, + "learning_rate": 9.690695555330567e-05, + "loss": 1.8778, + "step": 621 + }, + { + "epoch": 0.1887995143420853, + "grad_norm": 0.44261273741722107, + "learning_rate": 9.690189328743546e-05, + "loss": 2.0903, + "step": 622 + }, + { + "epoch": 0.18910305053877674, + "grad_norm": 0.4438890516757965, + "learning_rate": 9.689683102156526e-05, + "loss": 1.772, + "step": 623 + }, + { + "epoch": 0.1894065867354682, + "grad_norm": 0.40160682797431946, + "learning_rate": 9.689176875569505e-05, + "loss": 2.0964, + "step": 624 + }, + { + "epoch": 0.18971012293215966, + "grad_norm": 0.4022195637226105, + "learning_rate": 9.688670648982485e-05, + "loss": 1.7818, + "step": 625 + }, + { + "epoch": 0.19001365912885113, + "grad_norm": 0.4233214855194092, + "learning_rate": 9.688164422395464e-05, + "loss": 1.922, + "step": 626 + }, + { + "epoch": 0.19031719532554256, + "grad_norm": 0.3864254057407379, + "learning_rate": 9.687658195808445e-05, + "loss": 2.0279, + "step": 627 + }, + { + "epoch": 0.19062073152223402, + "grad_norm": 0.36527585983276367, + "learning_rate": 9.687151969221424e-05, + "loss": 2.0732, + "step": 628 + }, + { + "epoch": 0.19092426771892548, + "grad_norm": 0.399237722158432, + "learning_rate": 9.686645742634404e-05, + "loss": 1.8889, + "step": 629 + }, + { + "epoch": 0.19122780391561695, + "grad_norm": 0.3860459625720978, + "learning_rate": 9.686139516047383e-05, + "loss": 1.968, + "step": 630 + }, + { + "epoch": 0.19153134011230838, + "grad_norm": 0.32555973529815674, + "learning_rate": 9.685633289460363e-05, + "loss": 2.0722, + "step": 631 + }, + { + "epoch": 0.19183487630899984, + "grad_norm": 0.6093998551368713, + "learning_rate": 9.685127062873342e-05, + "loss": 1.8553, + "step": 632 + }, + { + "epoch": 0.1921384125056913, + "grad_norm": 0.4218057692050934, + "learning_rate": 9.684620836286322e-05, + "loss": 1.9647, + "step": 633 + }, + { + "epoch": 0.19244194870238276, + "grad_norm": 0.3779148757457733, + "learning_rate": 9.684114609699301e-05, + "loss": 2.0681, + "step": 634 + }, + { + "epoch": 0.19274548489907423, + "grad_norm": 0.3820381760597229, + "learning_rate": 9.683608383112281e-05, + "loss": 2.0603, + "step": 635 + }, + { + "epoch": 0.19304902109576566, + "grad_norm": 0.29337063431739807, + "learning_rate": 9.683102156525262e-05, + "loss": 1.7516, + "step": 636 + }, + { + "epoch": 0.19335255729245712, + "grad_norm": 0.4369249939918518, + "learning_rate": 9.682595929938241e-05, + "loss": 1.9822, + "step": 637 + }, + { + "epoch": 0.19365609348914858, + "grad_norm": 0.3766214847564697, + "learning_rate": 9.68208970335122e-05, + "loss": 1.7229, + "step": 638 + }, + { + "epoch": 0.19395962968584005, + "grad_norm": 0.4765011668205261, + "learning_rate": 9.6815834767642e-05, + "loss": 1.2865, + "step": 639 + }, + { + "epoch": 0.19426316588253148, + "grad_norm": 0.34236472845077515, + "learning_rate": 9.68107725017718e-05, + "loss": 2.1024, + "step": 640 + }, + { + "epoch": 0.19456670207922294, + "grad_norm": 0.398076593875885, + "learning_rate": 9.680571023590159e-05, + "loss": 1.8628, + "step": 641 + }, + { + "epoch": 0.1948702382759144, + "grad_norm": 0.357099711894989, + "learning_rate": 9.680064797003139e-05, + "loss": 2.2163, + "step": 642 + }, + { + "epoch": 0.19517377447260587, + "grad_norm": 0.3296545445919037, + "learning_rate": 9.679558570416118e-05, + "loss": 1.8227, + "step": 643 + }, + { + "epoch": 0.19547731066929733, + "grad_norm": 0.36754927039146423, + "learning_rate": 9.679052343829098e-05, + "loss": 1.7179, + "step": 644 + }, + { + "epoch": 0.19578084686598876, + "grad_norm": 0.37275364995002747, + "learning_rate": 9.678546117242078e-05, + "loss": 1.6782, + "step": 645 + }, + { + "epoch": 0.19608438306268022, + "grad_norm": 0.3951006531715393, + "learning_rate": 9.678039890655058e-05, + "loss": 2.0756, + "step": 646 + }, + { + "epoch": 0.19638791925937168, + "grad_norm": 0.3560970425605774, + "learning_rate": 9.677533664068037e-05, + "loss": 1.8093, + "step": 647 + }, + { + "epoch": 0.19669145545606315, + "grad_norm": 0.31553730368614197, + "learning_rate": 9.677027437481017e-05, + "loss": 1.9174, + "step": 648 + }, + { + "epoch": 0.19699499165275458, + "grad_norm": 0.39949625730514526, + "learning_rate": 9.676521210893996e-05, + "loss": 1.6687, + "step": 649 + }, + { + "epoch": 0.19729852784944604, + "grad_norm": 0.37323635816574097, + "learning_rate": 9.676014984306976e-05, + "loss": 1.8149, + "step": 650 + }, + { + "epoch": 0.1976020640461375, + "grad_norm": 0.43527746200561523, + "learning_rate": 9.675508757719955e-05, + "loss": 1.8744, + "step": 651 + }, + { + "epoch": 0.19790560024282897, + "grad_norm": 0.39380425214767456, + "learning_rate": 9.675002531132935e-05, + "loss": 1.9721, + "step": 652 + }, + { + "epoch": 0.1982091364395204, + "grad_norm": 0.3384545147418976, + "learning_rate": 9.674496304545914e-05, + "loss": 2.0122, + "step": 653 + }, + { + "epoch": 0.19851267263621186, + "grad_norm": 0.39647915959358215, + "learning_rate": 9.673990077958894e-05, + "loss": 2.2419, + "step": 654 + }, + { + "epoch": 0.19881620883290332, + "grad_norm": 0.3358941674232483, + "learning_rate": 9.673483851371875e-05, + "loss": 1.8758, + "step": 655 + }, + { + "epoch": 0.19911974502959479, + "grad_norm": 0.3486049771308899, + "learning_rate": 9.672977624784855e-05, + "loss": 1.5762, + "step": 656 + }, + { + "epoch": 0.19942328122628625, + "grad_norm": 2.3050696849823, + "learning_rate": 9.672471398197835e-05, + "loss": 2.0056, + "step": 657 + }, + { + "epoch": 0.19972681742297768, + "grad_norm": 0.35023945569992065, + "learning_rate": 9.671965171610814e-05, + "loss": 1.619, + "step": 658 + }, + { + "epoch": 0.20003035361966914, + "grad_norm": 0.513656735420227, + "learning_rate": 9.671458945023794e-05, + "loss": 1.5269, + "step": 659 + }, + { + "epoch": 0.2003338898163606, + "grad_norm": 0.37498149275779724, + "learning_rate": 9.670952718436773e-05, + "loss": 1.8553, + "step": 660 + }, + { + "epoch": 0.20063742601305207, + "grad_norm": 0.4101942479610443, + "learning_rate": 9.670446491849753e-05, + "loss": 2.1121, + "step": 661 + }, + { + "epoch": 0.2009409622097435, + "grad_norm": 0.4265679717063904, + "learning_rate": 9.669940265262732e-05, + "loss": 2.1863, + "step": 662 + }, + { + "epoch": 0.20124449840643496, + "grad_norm": 4.817168712615967, + "learning_rate": 9.669434038675712e-05, + "loss": 2.0906, + "step": 663 + }, + { + "epoch": 0.20154803460312642, + "grad_norm": 7.518252849578857, + "learning_rate": 9.668927812088691e-05, + "loss": 1.8889, + "step": 664 + }, + { + "epoch": 0.2018515707998179, + "grad_norm": 0.5480749011039734, + "learning_rate": 9.66842158550167e-05, + "loss": 1.8439, + "step": 665 + }, + { + "epoch": 0.20215510699650932, + "grad_norm": 0.3578292429447174, + "learning_rate": 9.667915358914651e-05, + "loss": 1.8742, + "step": 666 + }, + { + "epoch": 0.20245864319320078, + "grad_norm": 0.3799275755882263, + "learning_rate": 9.667409132327631e-05, + "loss": 1.994, + "step": 667 + }, + { + "epoch": 0.20276217938989224, + "grad_norm": 0.3736335039138794, + "learning_rate": 9.66690290574061e-05, + "loss": 1.7933, + "step": 668 + }, + { + "epoch": 0.2030657155865837, + "grad_norm": 0.3145211637020111, + "learning_rate": 9.66639667915359e-05, + "loss": 1.8193, + "step": 669 + }, + { + "epoch": 0.20336925178327517, + "grad_norm": 0.4940774142742157, + "learning_rate": 9.66589045256657e-05, + "loss": 1.9238, + "step": 670 + }, + { + "epoch": 0.2036727879799666, + "grad_norm": 0.431134968996048, + "learning_rate": 9.665384225979549e-05, + "loss": 1.5493, + "step": 671 + }, + { + "epoch": 0.20397632417665806, + "grad_norm": 0.41438859701156616, + "learning_rate": 9.664877999392528e-05, + "loss": 1.2076, + "step": 672 + }, + { + "epoch": 0.20427986037334953, + "grad_norm": 0.38191312551498413, + "learning_rate": 9.664371772805508e-05, + "loss": 1.8201, + "step": 673 + }, + { + "epoch": 0.204583396570041, + "grad_norm": 0.3938577175140381, + "learning_rate": 9.663865546218487e-05, + "loss": 1.5166, + "step": 674 + }, + { + "epoch": 0.20488693276673242, + "grad_norm": 0.46312233805656433, + "learning_rate": 9.663359319631468e-05, + "loss": 1.4652, + "step": 675 + }, + { + "epoch": 0.20519046896342388, + "grad_norm": 0.4087234139442444, + "learning_rate": 9.662853093044448e-05, + "loss": 1.8288, + "step": 676 + }, + { + "epoch": 0.20549400516011535, + "grad_norm": 0.37329304218292236, + "learning_rate": 9.662346866457427e-05, + "loss": 1.9084, + "step": 677 + }, + { + "epoch": 0.2057975413568068, + "grad_norm": 0.37109607458114624, + "learning_rate": 9.661840639870407e-05, + "loss": 1.9674, + "step": 678 + }, + { + "epoch": 0.20610107755349824, + "grad_norm": 0.3936561942100525, + "learning_rate": 9.661334413283386e-05, + "loss": 2.0342, + "step": 679 + }, + { + "epoch": 0.2064046137501897, + "grad_norm": 0.4621008634567261, + "learning_rate": 9.660828186696366e-05, + "loss": 1.5157, + "step": 680 + }, + { + "epoch": 0.20670814994688116, + "grad_norm": 0.3849358558654785, + "learning_rate": 9.660321960109345e-05, + "loss": 2.1513, + "step": 681 + }, + { + "epoch": 0.20701168614357263, + "grad_norm": 0.4873330295085907, + "learning_rate": 9.659815733522325e-05, + "loss": 1.9116, + "step": 682 + }, + { + "epoch": 0.2073152223402641, + "grad_norm": 0.4687885642051697, + "learning_rate": 9.659309506935304e-05, + "loss": 2.278, + "step": 683 + }, + { + "epoch": 0.20761875853695552, + "grad_norm": 0.3966952860355377, + "learning_rate": 9.658803280348285e-05, + "loss": 1.4625, + "step": 684 + }, + { + "epoch": 0.20792229473364698, + "grad_norm": 0.5782402157783508, + "learning_rate": 9.658297053761264e-05, + "loss": 2.2779, + "step": 685 + }, + { + "epoch": 0.20822583093033845, + "grad_norm": 0.37465688586235046, + "learning_rate": 9.657790827174244e-05, + "loss": 1.8462, + "step": 686 + }, + { + "epoch": 0.2085293671270299, + "grad_norm": 0.34408631920814514, + "learning_rate": 9.657284600587223e-05, + "loss": 1.9881, + "step": 687 + }, + { + "epoch": 0.20883290332372134, + "grad_norm": 0.6892307996749878, + "learning_rate": 9.656778374000203e-05, + "loss": 1.9835, + "step": 688 + }, + { + "epoch": 0.2091364395204128, + "grad_norm": 0.3698042631149292, + "learning_rate": 9.656272147413182e-05, + "loss": 2.0665, + "step": 689 + }, + { + "epoch": 0.20943997571710427, + "grad_norm": 0.41265738010406494, + "learning_rate": 9.655765920826162e-05, + "loss": 2.0231, + "step": 690 + }, + { + "epoch": 0.20974351191379573, + "grad_norm": 0.38251030445098877, + "learning_rate": 9.655259694239141e-05, + "loss": 1.7058, + "step": 691 + }, + { + "epoch": 0.2100470481104872, + "grad_norm": 0.468905508518219, + "learning_rate": 9.65475346765212e-05, + "loss": 1.6182, + "step": 692 + }, + { + "epoch": 0.21035058430717862, + "grad_norm": 1.0570484399795532, + "learning_rate": 9.6542472410651e-05, + "loss": 2.0165, + "step": 693 + }, + { + "epoch": 0.21065412050387008, + "grad_norm": 0.3978007435798645, + "learning_rate": 9.653741014478081e-05, + "loss": 1.7859, + "step": 694 + }, + { + "epoch": 0.21095765670056155, + "grad_norm": 0.42616939544677734, + "learning_rate": 9.65323478789106e-05, + "loss": 1.5197, + "step": 695 + }, + { + "epoch": 0.211261192897253, + "grad_norm": 0.39380377531051636, + "learning_rate": 9.65272856130404e-05, + "loss": 1.3796, + "step": 696 + }, + { + "epoch": 0.21156472909394444, + "grad_norm": 0.38581010699272156, + "learning_rate": 9.65222233471702e-05, + "loss": 1.8214, + "step": 697 + }, + { + "epoch": 0.2118682652906359, + "grad_norm": 0.3610150218009949, + "learning_rate": 9.651716108129999e-05, + "loss": 1.897, + "step": 698 + }, + { + "epoch": 0.21217180148732737, + "grad_norm": 0.44913700222969055, + "learning_rate": 9.651209881542978e-05, + "loss": 1.8873, + "step": 699 + }, + { + "epoch": 0.21247533768401883, + "grad_norm": 1.9599745273590088, + "learning_rate": 9.650703654955959e-05, + "loss": 1.946, + "step": 700 + }, + { + "epoch": 0.21277887388071026, + "grad_norm": 1.195716381072998, + "learning_rate": 9.650197428368939e-05, + "loss": 1.8749, + "step": 701 + }, + { + "epoch": 0.21308241007740172, + "grad_norm": 0.3154665231704712, + "learning_rate": 9.649691201781918e-05, + "loss": 1.5924, + "step": 702 + }, + { + "epoch": 0.21338594627409319, + "grad_norm": 0.3550672233104706, + "learning_rate": 9.649184975194898e-05, + "loss": 1.6094, + "step": 703 + }, + { + "epoch": 0.21368948247078465, + "grad_norm": 0.33744126558303833, + "learning_rate": 9.648678748607877e-05, + "loss": 1.3399, + "step": 704 + }, + { + "epoch": 0.2139930186674761, + "grad_norm": 0.33931079506874084, + "learning_rate": 9.648172522020858e-05, + "loss": 2.0096, + "step": 705 + }, + { + "epoch": 0.21429655486416754, + "grad_norm": 0.38951364159584045, + "learning_rate": 9.647666295433837e-05, + "loss": 1.7676, + "step": 706 + }, + { + "epoch": 0.214600091060859, + "grad_norm": 0.408087819814682, + "learning_rate": 9.647160068846817e-05, + "loss": 1.7948, + "step": 707 + }, + { + "epoch": 0.21490362725755047, + "grad_norm": 0.37058812379837036, + "learning_rate": 9.646653842259796e-05, + "loss": 1.9891, + "step": 708 + }, + { + "epoch": 0.21520716345424193, + "grad_norm": 0.4003254473209381, + "learning_rate": 9.646147615672776e-05, + "loss": 1.8895, + "step": 709 + }, + { + "epoch": 0.21551069965093336, + "grad_norm": 0.38838204741477966, + "learning_rate": 9.645641389085755e-05, + "loss": 2.0121, + "step": 710 + }, + { + "epoch": 0.21581423584762482, + "grad_norm": 0.41912707686424255, + "learning_rate": 9.645135162498735e-05, + "loss": 1.9804, + "step": 711 + }, + { + "epoch": 0.2161177720443163, + "grad_norm": 0.353454053401947, + "learning_rate": 9.644628935911714e-05, + "loss": 2.0478, + "step": 712 + }, + { + "epoch": 0.21642130824100775, + "grad_norm": 0.3825720548629761, + "learning_rate": 9.644122709324694e-05, + "loss": 1.6676, + "step": 713 + }, + { + "epoch": 0.21672484443769918, + "grad_norm": 0.4197389781475067, + "learning_rate": 9.643616482737675e-05, + "loss": 1.9732, + "step": 714 + }, + { + "epoch": 0.21702838063439064, + "grad_norm": 0.4452435076236725, + "learning_rate": 9.643110256150654e-05, + "loss": 2.0918, + "step": 715 + }, + { + "epoch": 0.2173319168310821, + "grad_norm": 0.3366299271583557, + "learning_rate": 9.642604029563634e-05, + "loss": 1.7469, + "step": 716 + }, + { + "epoch": 0.21763545302777357, + "grad_norm": 0.31280553340911865, + "learning_rate": 9.642097802976613e-05, + "loss": 2.0348, + "step": 717 + }, + { + "epoch": 0.21793898922446503, + "grad_norm": 0.425503671169281, + "learning_rate": 9.641591576389593e-05, + "loss": 1.3629, + "step": 718 + }, + { + "epoch": 0.21824252542115646, + "grad_norm": 0.3986441493034363, + "learning_rate": 9.641085349802572e-05, + "loss": 1.4703, + "step": 719 + }, + { + "epoch": 0.21854606161784793, + "grad_norm": 0.34377026557922363, + "learning_rate": 9.640579123215552e-05, + "loss": 1.9788, + "step": 720 + }, + { + "epoch": 0.2188495978145394, + "grad_norm": 0.3445621430873871, + "learning_rate": 9.640072896628531e-05, + "loss": 1.9137, + "step": 721 + }, + { + "epoch": 0.21915313401123085, + "grad_norm": 0.40363574028015137, + "learning_rate": 9.63956667004151e-05, + "loss": 1.8911, + "step": 722 + }, + { + "epoch": 0.21945667020792228, + "grad_norm": 0.36166059970855713, + "learning_rate": 9.639060443454491e-05, + "loss": 1.9176, + "step": 723 + }, + { + "epoch": 0.21976020640461374, + "grad_norm": 0.7732321619987488, + "learning_rate": 9.638554216867471e-05, + "loss": 2.1942, + "step": 724 + }, + { + "epoch": 0.2200637426013052, + "grad_norm": 0.4042604863643646, + "learning_rate": 9.63804799028045e-05, + "loss": 1.8964, + "step": 725 + }, + { + "epoch": 0.22036727879799667, + "grad_norm": 0.3888862133026123, + "learning_rate": 9.63754176369343e-05, + "loss": 1.716, + "step": 726 + }, + { + "epoch": 0.22067081499468813, + "grad_norm": 0.32185250520706177, + "learning_rate": 9.637035537106409e-05, + "loss": 2.1227, + "step": 727 + }, + { + "epoch": 0.22097435119137956, + "grad_norm": 0.36421746015548706, + "learning_rate": 9.636529310519389e-05, + "loss": 1.3262, + "step": 728 + }, + { + "epoch": 0.22127788738807103, + "grad_norm": 0.42780765891075134, + "learning_rate": 9.636023083932368e-05, + "loss": 1.806, + "step": 729 + }, + { + "epoch": 0.2215814235847625, + "grad_norm": 0.3754510283470154, + "learning_rate": 9.635516857345348e-05, + "loss": 1.9286, + "step": 730 + }, + { + "epoch": 0.22188495978145395, + "grad_norm": 0.35199174284935, + "learning_rate": 9.635010630758327e-05, + "loss": 1.9703, + "step": 731 + }, + { + "epoch": 0.22218849597814538, + "grad_norm": 0.36272746324539185, + "learning_rate": 9.634504404171307e-05, + "loss": 1.7773, + "step": 732 + }, + { + "epoch": 0.22249203217483685, + "grad_norm": 0.4233802556991577, + "learning_rate": 9.633998177584287e-05, + "loss": 2.0016, + "step": 733 + }, + { + "epoch": 0.2227955683715283, + "grad_norm": 0.46138089895248413, + "learning_rate": 9.633491950997267e-05, + "loss": 1.764, + "step": 734 + }, + { + "epoch": 0.22309910456821977, + "grad_norm": 0.37863031029701233, + "learning_rate": 9.632985724410246e-05, + "loss": 1.6493, + "step": 735 + }, + { + "epoch": 0.2234026407649112, + "grad_norm": 0.4493837356567383, + "learning_rate": 9.632479497823226e-05, + "loss": 2.04, + "step": 736 + }, + { + "epoch": 0.22370617696160267, + "grad_norm": 0.581119179725647, + "learning_rate": 9.631973271236205e-05, + "loss": 1.777, + "step": 737 + }, + { + "epoch": 0.22400971315829413, + "grad_norm": 0.3730584979057312, + "learning_rate": 9.631467044649185e-05, + "loss": 1.8932, + "step": 738 + }, + { + "epoch": 0.2243132493549856, + "grad_norm": 0.351421594619751, + "learning_rate": 9.630960818062164e-05, + "loss": 2.3182, + "step": 739 + }, + { + "epoch": 0.22461678555167705, + "grad_norm": 0.4237976670265198, + "learning_rate": 9.630454591475144e-05, + "loss": 2.1315, + "step": 740 + }, + { + "epoch": 0.22492032174836848, + "grad_norm": 0.38544562458992004, + "learning_rate": 9.629948364888123e-05, + "loss": 1.9596, + "step": 741 + }, + { + "epoch": 0.22522385794505995, + "grad_norm": 0.407672256231308, + "learning_rate": 9.629442138301104e-05, + "loss": 1.8694, + "step": 742 + }, + { + "epoch": 0.2255273941417514, + "grad_norm": 0.4415782690048218, + "learning_rate": 9.628935911714084e-05, + "loss": 1.8658, + "step": 743 + }, + { + "epoch": 0.22583093033844287, + "grad_norm": 0.41300657391548157, + "learning_rate": 9.628429685127063e-05, + "loss": 2.0477, + "step": 744 + }, + { + "epoch": 0.2261344665351343, + "grad_norm": 0.36000654101371765, + "learning_rate": 9.627923458540044e-05, + "loss": 1.9045, + "step": 745 + }, + { + "epoch": 0.22643800273182577, + "grad_norm": 0.42653003334999084, + "learning_rate": 9.627417231953023e-05, + "loss": 1.2151, + "step": 746 + }, + { + "epoch": 0.22674153892851723, + "grad_norm": 0.4157649874687195, + "learning_rate": 9.626911005366003e-05, + "loss": 1.9335, + "step": 747 + }, + { + "epoch": 0.2270450751252087, + "grad_norm": 0.3805077373981476, + "learning_rate": 9.626404778778982e-05, + "loss": 2.0803, + "step": 748 + }, + { + "epoch": 0.22734861132190012, + "grad_norm": 0.39710867404937744, + "learning_rate": 9.625898552191962e-05, + "loss": 2.2628, + "step": 749 + }, + { + "epoch": 0.22765214751859159, + "grad_norm": 0.4012609124183655, + "learning_rate": 9.625392325604941e-05, + "loss": 1.9586, + "step": 750 + }, + { + "epoch": 0.22795568371528305, + "grad_norm": 0.9281008243560791, + "learning_rate": 9.624886099017921e-05, + "loss": 1.168, + "step": 751 + }, + { + "epoch": 0.2282592199119745, + "grad_norm": 0.36847764253616333, + "learning_rate": 9.6243798724309e-05, + "loss": 1.8907, + "step": 752 + }, + { + "epoch": 0.22856275610866597, + "grad_norm": 0.4531751573085785, + "learning_rate": 9.623873645843881e-05, + "loss": 1.4511, + "step": 753 + }, + { + "epoch": 0.2288662923053574, + "grad_norm": 0.36623820662498474, + "learning_rate": 9.62336741925686e-05, + "loss": 1.6707, + "step": 754 + }, + { + "epoch": 0.22916982850204887, + "grad_norm": 0.3104342222213745, + "learning_rate": 9.62286119266984e-05, + "loss": 1.988, + "step": 755 + }, + { + "epoch": 0.22947336469874033, + "grad_norm": 0.3790084421634674, + "learning_rate": 9.62235496608282e-05, + "loss": 1.979, + "step": 756 + }, + { + "epoch": 0.2297769008954318, + "grad_norm": 0.3642970323562622, + "learning_rate": 9.621848739495799e-05, + "loss": 1.9998, + "step": 757 + }, + { + "epoch": 0.23008043709212322, + "grad_norm": 0.34588292241096497, + "learning_rate": 9.621342512908779e-05, + "loss": 2.0511, + "step": 758 + }, + { + "epoch": 0.2303839732888147, + "grad_norm": 0.3556496798992157, + "learning_rate": 9.620836286321758e-05, + "loss": 1.8785, + "step": 759 + }, + { + "epoch": 0.23068750948550615, + "grad_norm": 0.4669034779071808, + "learning_rate": 9.620330059734737e-05, + "loss": 1.5027, + "step": 760 + }, + { + "epoch": 0.2309910456821976, + "grad_norm": 0.39685994386672974, + "learning_rate": 9.619823833147717e-05, + "loss": 2.1644, + "step": 761 + }, + { + "epoch": 0.23129458187888904, + "grad_norm": 0.39183005690574646, + "learning_rate": 9.619317606560698e-05, + "loss": 1.9615, + "step": 762 + }, + { + "epoch": 0.2315981180755805, + "grad_norm": 0.36401331424713135, + "learning_rate": 9.618811379973677e-05, + "loss": 1.7535, + "step": 763 + }, + { + "epoch": 0.23190165427227197, + "grad_norm": 0.43118295073509216, + "learning_rate": 9.618305153386657e-05, + "loss": 1.884, + "step": 764 + }, + { + "epoch": 0.23220519046896343, + "grad_norm": 0.5061665177345276, + "learning_rate": 9.617798926799636e-05, + "loss": 2.0051, + "step": 765 + }, + { + "epoch": 0.2325087266656549, + "grad_norm": 0.4487472474575043, + "learning_rate": 9.617292700212616e-05, + "loss": 1.6831, + "step": 766 + }, + { + "epoch": 0.23281226286234633, + "grad_norm": 0.3660997450351715, + "learning_rate": 9.616786473625595e-05, + "loss": 1.9276, + "step": 767 + }, + { + "epoch": 0.2331157990590378, + "grad_norm": 0.3823026716709137, + "learning_rate": 9.616280247038575e-05, + "loss": 1.9817, + "step": 768 + }, + { + "epoch": 0.23341933525572925, + "grad_norm": 0.32568395137786865, + "learning_rate": 9.615774020451554e-05, + "loss": 1.508, + "step": 769 + }, + { + "epoch": 0.2337228714524207, + "grad_norm": 0.34985265135765076, + "learning_rate": 9.615267793864534e-05, + "loss": 1.6793, + "step": 770 + }, + { + "epoch": 0.23402640764911214, + "grad_norm": 0.38563957810401917, + "learning_rate": 9.614761567277513e-05, + "loss": 1.588, + "step": 771 + }, + { + "epoch": 0.2343299438458036, + "grad_norm": 0.33572301268577576, + "learning_rate": 9.614255340690494e-05, + "loss": 1.9541, + "step": 772 + }, + { + "epoch": 0.23463348004249507, + "grad_norm": 0.33936449885368347, + "learning_rate": 9.613749114103473e-05, + "loss": 1.9311, + "step": 773 + }, + { + "epoch": 0.23493701623918653, + "grad_norm": 0.34984657168388367, + "learning_rate": 9.613242887516453e-05, + "loss": 1.9532, + "step": 774 + }, + { + "epoch": 0.235240552435878, + "grad_norm": 0.3651373088359833, + "learning_rate": 9.612736660929432e-05, + "loss": 1.8815, + "step": 775 + }, + { + "epoch": 0.23554408863256943, + "grad_norm": 0.4317852854728699, + "learning_rate": 9.612230434342412e-05, + "loss": 2.0262, + "step": 776 + }, + { + "epoch": 0.2358476248292609, + "grad_norm": 0.375522255897522, + "learning_rate": 9.611724207755391e-05, + "loss": 1.9964, + "step": 777 + }, + { + "epoch": 0.23615116102595235, + "grad_norm": 0.37290844321250916, + "learning_rate": 9.611217981168371e-05, + "loss": 1.7456, + "step": 778 + }, + { + "epoch": 0.2364546972226438, + "grad_norm": 0.3768545985221863, + "learning_rate": 9.61071175458135e-05, + "loss": 1.9591, + "step": 779 + }, + { + "epoch": 0.23675823341933525, + "grad_norm": 0.3147246837615967, + "learning_rate": 9.61020552799433e-05, + "loss": 1.4033, + "step": 780 + }, + { + "epoch": 0.2370617696160267, + "grad_norm": 0.4480874240398407, + "learning_rate": 9.60969930140731e-05, + "loss": 1.9598, + "step": 781 + }, + { + "epoch": 0.23736530581271817, + "grad_norm": 0.7287562489509583, + "learning_rate": 9.60919307482029e-05, + "loss": 2.0097, + "step": 782 + }, + { + "epoch": 0.23766884200940963, + "grad_norm": 0.36199334263801575, + "learning_rate": 9.60868684823327e-05, + "loss": 1.8089, + "step": 783 + }, + { + "epoch": 0.23797237820610107, + "grad_norm": 0.32855263352394104, + "learning_rate": 9.608180621646249e-05, + "loss": 2.0199, + "step": 784 + }, + { + "epoch": 0.23827591440279253, + "grad_norm": 0.37182894349098206, + "learning_rate": 9.607674395059229e-05, + "loss": 1.7253, + "step": 785 + }, + { + "epoch": 0.238579450599484, + "grad_norm": 0.3365595042705536, + "learning_rate": 9.607168168472208e-05, + "loss": 1.9308, + "step": 786 + }, + { + "epoch": 0.23888298679617545, + "grad_norm": 0.400685578584671, + "learning_rate": 9.606661941885187e-05, + "loss": 1.8939, + "step": 787 + }, + { + "epoch": 0.2391865229928669, + "grad_norm": 0.6354159116744995, + "learning_rate": 9.606155715298167e-05, + "loss": 2.1476, + "step": 788 + }, + { + "epoch": 0.23949005918955835, + "grad_norm": 0.4196738600730896, + "learning_rate": 9.605649488711148e-05, + "loss": 1.8457, + "step": 789 + }, + { + "epoch": 0.2397935953862498, + "grad_norm": 0.35839545726776123, + "learning_rate": 9.605143262124127e-05, + "loss": 1.824, + "step": 790 + }, + { + "epoch": 0.24009713158294127, + "grad_norm": 0.3597940504550934, + "learning_rate": 9.604637035537107e-05, + "loss": 1.9583, + "step": 791 + }, + { + "epoch": 0.24040066777963273, + "grad_norm": 0.5783160924911499, + "learning_rate": 9.604130808950088e-05, + "loss": 2.2, + "step": 792 + }, + { + "epoch": 0.24070420397632417, + "grad_norm": 0.3544808030128479, + "learning_rate": 9.603624582363067e-05, + "loss": 2.1092, + "step": 793 + }, + { + "epoch": 0.24100774017301563, + "grad_norm": 0.41170623898506165, + "learning_rate": 9.603118355776047e-05, + "loss": 1.6004, + "step": 794 + }, + { + "epoch": 0.2413112763697071, + "grad_norm": 0.3832992613315582, + "learning_rate": 9.602612129189026e-05, + "loss": 1.4981, + "step": 795 + }, + { + "epoch": 0.24161481256639855, + "grad_norm": 0.5239993333816528, + "learning_rate": 9.602105902602006e-05, + "loss": 1.6026, + "step": 796 + }, + { + "epoch": 0.24191834876308999, + "grad_norm": 0.38445138931274414, + "learning_rate": 9.601599676014985e-05, + "loss": 1.5765, + "step": 797 + }, + { + "epoch": 0.24222188495978145, + "grad_norm": 0.38520511984825134, + "learning_rate": 9.601093449427964e-05, + "loss": 2.1069, + "step": 798 + }, + { + "epoch": 0.2425254211564729, + "grad_norm": 0.3519560694694519, + "learning_rate": 9.600587222840944e-05, + "loss": 1.8896, + "step": 799 + }, + { + "epoch": 0.24282895735316437, + "grad_norm": 0.5392457246780396, + "learning_rate": 9.600080996253923e-05, + "loss": 1.6273, + "step": 800 + }, + { + "epoch": 0.24313249354985583, + "grad_norm": 0.4213111996650696, + "learning_rate": 9.599574769666904e-05, + "loss": 1.489, + "step": 801 + }, + { + "epoch": 0.24343602974654727, + "grad_norm": 0.4006531834602356, + "learning_rate": 9.599068543079884e-05, + "loss": 1.9842, + "step": 802 + }, + { + "epoch": 0.24373956594323873, + "grad_norm": 0.3792324364185333, + "learning_rate": 9.598562316492863e-05, + "loss": 1.727, + "step": 803 + }, + { + "epoch": 0.2440431021399302, + "grad_norm": 0.3555270731449127, + "learning_rate": 9.598056089905843e-05, + "loss": 1.68, + "step": 804 + }, + { + "epoch": 0.24434663833662165, + "grad_norm": 0.33837342262268066, + "learning_rate": 9.597549863318822e-05, + "loss": 2.0709, + "step": 805 + }, + { + "epoch": 0.2446501745333131, + "grad_norm": 0.3812510371208191, + "learning_rate": 9.597043636731802e-05, + "loss": 2.1211, + "step": 806 + }, + { + "epoch": 0.24495371073000455, + "grad_norm": 0.33870792388916016, + "learning_rate": 9.596537410144781e-05, + "loss": 2.1047, + "step": 807 + }, + { + "epoch": 0.245257246926696, + "grad_norm": 0.3948252201080322, + "learning_rate": 9.59603118355776e-05, + "loss": 1.7553, + "step": 808 + }, + { + "epoch": 0.24556078312338747, + "grad_norm": 0.39410725235939026, + "learning_rate": 9.59552495697074e-05, + "loss": 1.9383, + "step": 809 + }, + { + "epoch": 0.2458643193200789, + "grad_norm": 0.37794989347457886, + "learning_rate": 9.59501873038372e-05, + "loss": 1.9115, + "step": 810 + }, + { + "epoch": 0.24616785551677037, + "grad_norm": 1.6270610094070435, + "learning_rate": 9.5945125037967e-05, + "loss": 1.8472, + "step": 811 + }, + { + "epoch": 0.24647139171346183, + "grad_norm": 0.3724587559700012, + "learning_rate": 9.59400627720968e-05, + "loss": 1.9087, + "step": 812 + }, + { + "epoch": 0.2467749279101533, + "grad_norm": 0.4097403585910797, + "learning_rate": 9.59350005062266e-05, + "loss": 1.8325, + "step": 813 + }, + { + "epoch": 0.24707846410684475, + "grad_norm": 0.4052940905094147, + "learning_rate": 9.592993824035639e-05, + "loss": 2.0241, + "step": 814 + }, + { + "epoch": 0.2473820003035362, + "grad_norm": 0.3887682557106018, + "learning_rate": 9.592487597448618e-05, + "loss": 1.6114, + "step": 815 + }, + { + "epoch": 0.24768553650022765, + "grad_norm": 0.404450386762619, + "learning_rate": 9.591981370861598e-05, + "loss": 1.8384, + "step": 816 + }, + { + "epoch": 0.2479890726969191, + "grad_norm": 0.7955893874168396, + "learning_rate": 9.591475144274577e-05, + "loss": 2.2149, + "step": 817 + }, + { + "epoch": 0.24829260889361057, + "grad_norm": 4.355859279632568, + "learning_rate": 9.590968917687557e-05, + "loss": 2.3753, + "step": 818 + }, + { + "epoch": 0.248596145090302, + "grad_norm": 0.3698444962501526, + "learning_rate": 9.590462691100536e-05, + "loss": 1.7354, + "step": 819 + }, + { + "epoch": 0.24889968128699347, + "grad_norm": 0.3658899962902069, + "learning_rate": 9.589956464513517e-05, + "loss": 1.7803, + "step": 820 + }, + { + "epoch": 0.24920321748368493, + "grad_norm": 0.405072957277298, + "learning_rate": 9.589450237926497e-05, + "loss": 1.7684, + "step": 821 + }, + { + "epoch": 0.2495067536803764, + "grad_norm": 0.7590973973274231, + "learning_rate": 9.588944011339476e-05, + "loss": 1.9466, + "step": 822 + }, + { + "epoch": 0.24981028987706785, + "grad_norm": 0.5217581987380981, + "learning_rate": 9.588437784752456e-05, + "loss": 2.1281, + "step": 823 + }, + { + "epoch": 0.2501138260737593, + "grad_norm": 0.3716435134410858, + "learning_rate": 9.587931558165435e-05, + "loss": 2.114, + "step": 824 + }, + { + "epoch": 0.25041736227045075, + "grad_norm": 0.44017624855041504, + "learning_rate": 9.587425331578414e-05, + "loss": 2.0445, + "step": 825 + }, + { + "epoch": 0.2507208984671422, + "grad_norm": 0.370370090007782, + "learning_rate": 9.586919104991394e-05, + "loss": 1.8674, + "step": 826 + }, + { + "epoch": 0.2510244346638337, + "grad_norm": 0.32125499844551086, + "learning_rate": 9.586412878404373e-05, + "loss": 1.4129, + "step": 827 + }, + { + "epoch": 0.2513279708605251, + "grad_norm": 0.4143073856830597, + "learning_rate": 9.585906651817353e-05, + "loss": 1.9895, + "step": 828 + }, + { + "epoch": 0.2516315070572166, + "grad_norm": 0.3492576777935028, + "learning_rate": 9.585400425230334e-05, + "loss": 2.0669, + "step": 829 + }, + { + "epoch": 0.25193504325390803, + "grad_norm": 0.4044751524925232, + "learning_rate": 9.584894198643313e-05, + "loss": 1.5909, + "step": 830 + }, + { + "epoch": 0.25223857945059946, + "grad_norm": 0.3410158157348633, + "learning_rate": 9.584387972056293e-05, + "loss": 1.7485, + "step": 831 + }, + { + "epoch": 0.25254211564729095, + "grad_norm": 0.340320348739624, + "learning_rate": 9.583881745469272e-05, + "loss": 1.8897, + "step": 832 + }, + { + "epoch": 0.2528456518439824, + "grad_norm": 0.35516276955604553, + "learning_rate": 9.583375518882252e-05, + "loss": 1.6332, + "step": 833 + }, + { + "epoch": 0.2531491880406738, + "grad_norm": 0.4099842309951782, + "learning_rate": 9.582869292295232e-05, + "loss": 1.5617, + "step": 834 + }, + { + "epoch": 0.2534527242373653, + "grad_norm": 0.38086098432540894, + "learning_rate": 9.582363065708212e-05, + "loss": 2.0837, + "step": 835 + }, + { + "epoch": 0.25375626043405675, + "grad_norm": 0.8040663003921509, + "learning_rate": 9.581856839121191e-05, + "loss": 1.8587, + "step": 836 + }, + { + "epoch": 0.25405979663074824, + "grad_norm": 0.41297683119773865, + "learning_rate": 9.581350612534171e-05, + "loss": 1.9602, + "step": 837 + }, + { + "epoch": 0.25436333282743967, + "grad_norm": 0.38155442476272583, + "learning_rate": 9.58084438594715e-05, + "loss": 1.375, + "step": 838 + }, + { + "epoch": 0.2546668690241311, + "grad_norm": 0.3956829905509949, + "learning_rate": 9.58033815936013e-05, + "loss": 1.9617, + "step": 839 + }, + { + "epoch": 0.2549704052208226, + "grad_norm": 0.38675928115844727, + "learning_rate": 9.579831932773111e-05, + "loss": 1.8186, + "step": 840 + }, + { + "epoch": 0.255273941417514, + "grad_norm": 0.33989018201828003, + "learning_rate": 9.57932570618609e-05, + "loss": 2.1734, + "step": 841 + }, + { + "epoch": 0.2555774776142055, + "grad_norm": 0.3240448534488678, + "learning_rate": 9.57881947959907e-05, + "loss": 1.6238, + "step": 842 + }, + { + "epoch": 0.25588101381089695, + "grad_norm": 0.6117075681686401, + "learning_rate": 9.578313253012049e-05, + "loss": 1.986, + "step": 843 + }, + { + "epoch": 0.2561845500075884, + "grad_norm": 0.3781290650367737, + "learning_rate": 9.577807026425029e-05, + "loss": 2.0021, + "step": 844 + }, + { + "epoch": 0.2564880862042799, + "grad_norm": 0.4373374879360199, + "learning_rate": 9.577300799838008e-05, + "loss": 2.0195, + "step": 845 + }, + { + "epoch": 0.2567916224009713, + "grad_norm": 0.4125923216342926, + "learning_rate": 9.576794573250988e-05, + "loss": 1.9412, + "step": 846 + }, + { + "epoch": 0.2570951585976628, + "grad_norm": 0.3557007908821106, + "learning_rate": 9.576288346663967e-05, + "loss": 1.8098, + "step": 847 + }, + { + "epoch": 0.25739869479435423, + "grad_norm": 0.49475541710853577, + "learning_rate": 9.575782120076947e-05, + "loss": 1.5756, + "step": 848 + }, + { + "epoch": 0.25770223099104567, + "grad_norm": 0.3507518768310547, + "learning_rate": 9.575275893489926e-05, + "loss": 1.6413, + "step": 849 + }, + { + "epoch": 0.25800576718773716, + "grad_norm": 0.39508333802223206, + "learning_rate": 9.574769666902907e-05, + "loss": 1.9777, + "step": 850 + }, + { + "epoch": 0.2583093033844286, + "grad_norm": 0.328807532787323, + "learning_rate": 9.574263440315886e-05, + "loss": 1.4948, + "step": 851 + }, + { + "epoch": 0.25861283958112, + "grad_norm": 0.3154551386833191, + "learning_rate": 9.573757213728866e-05, + "loss": 1.7809, + "step": 852 + }, + { + "epoch": 0.2589163757778115, + "grad_norm": 0.502554178237915, + "learning_rate": 9.573250987141845e-05, + "loss": 1.4369, + "step": 853 + }, + { + "epoch": 0.25921991197450295, + "grad_norm": 0.4416670799255371, + "learning_rate": 9.572744760554825e-05, + "loss": 1.7364, + "step": 854 + }, + { + "epoch": 0.25952344817119444, + "grad_norm": 0.43228060007095337, + "learning_rate": 9.572238533967804e-05, + "loss": 1.3281, + "step": 855 + }, + { + "epoch": 0.25982698436788587, + "grad_norm": 0.3714723289012909, + "learning_rate": 9.571732307380784e-05, + "loss": 2.0893, + "step": 856 + }, + { + "epoch": 0.2601305205645773, + "grad_norm": 0.3309679925441742, + "learning_rate": 9.571226080793763e-05, + "loss": 1.7982, + "step": 857 + }, + { + "epoch": 0.2604340567612688, + "grad_norm": 0.3709767460823059, + "learning_rate": 9.570719854206743e-05, + "loss": 1.8628, + "step": 858 + }, + { + "epoch": 0.26073759295796023, + "grad_norm": 0.6020816564559937, + "learning_rate": 9.570213627619724e-05, + "loss": 2.0077, + "step": 859 + }, + { + "epoch": 0.2610411291546517, + "grad_norm": 0.30620431900024414, + "learning_rate": 9.569707401032703e-05, + "loss": 1.8834, + "step": 860 + }, + { + "epoch": 0.26134466535134315, + "grad_norm": 0.41518962383270264, + "learning_rate": 9.569201174445683e-05, + "loss": 1.8025, + "step": 861 + }, + { + "epoch": 0.2616482015480346, + "grad_norm": 0.3919786512851715, + "learning_rate": 9.568694947858662e-05, + "loss": 1.995, + "step": 862 + }, + { + "epoch": 0.2619517377447261, + "grad_norm": 0.47429168224334717, + "learning_rate": 9.568188721271641e-05, + "loss": 1.9423, + "step": 863 + }, + { + "epoch": 0.2622552739414175, + "grad_norm": 0.8941421508789062, + "learning_rate": 9.567682494684621e-05, + "loss": 1.5046, + "step": 864 + }, + { + "epoch": 0.26255881013810894, + "grad_norm": 0.4357859194278717, + "learning_rate": 9.5671762680976e-05, + "loss": 2.0023, + "step": 865 + }, + { + "epoch": 0.26286234633480043, + "grad_norm": 0.3873944878578186, + "learning_rate": 9.56667004151058e-05, + "loss": 2.0607, + "step": 866 + }, + { + "epoch": 0.26316588253149187, + "grad_norm": 0.4355853497982025, + "learning_rate": 9.56616381492356e-05, + "loss": 1.8254, + "step": 867 + }, + { + "epoch": 0.26346941872818336, + "grad_norm": 0.3882213234901428, + "learning_rate": 9.56565758833654e-05, + "loss": 1.7809, + "step": 868 + }, + { + "epoch": 0.2637729549248748, + "grad_norm": 0.4021656811237335, + "learning_rate": 9.56515136174952e-05, + "loss": 2.1321, + "step": 869 + }, + { + "epoch": 0.2640764911215662, + "grad_norm": 0.43587526679039, + "learning_rate": 9.564645135162499e-05, + "loss": 1.7865, + "step": 870 + }, + { + "epoch": 0.2643800273182577, + "grad_norm": 0.364045649766922, + "learning_rate": 9.564138908575479e-05, + "loss": 1.8173, + "step": 871 + }, + { + "epoch": 0.26468356351494915, + "grad_norm": 0.3956625461578369, + "learning_rate": 9.563632681988458e-05, + "loss": 1.4822, + "step": 872 + }, + { + "epoch": 0.26498709971164064, + "grad_norm": 0.40755051374435425, + "learning_rate": 9.563126455401438e-05, + "loss": 1.9418, + "step": 873 + }, + { + "epoch": 0.2652906359083321, + "grad_norm": 0.39405086636543274, + "learning_rate": 9.562620228814417e-05, + "loss": 1.4529, + "step": 874 + }, + { + "epoch": 0.2655941721050235, + "grad_norm": 0.4400351047515869, + "learning_rate": 9.562114002227397e-05, + "loss": 2.1095, + "step": 875 + }, + { + "epoch": 0.265897708301715, + "grad_norm": 0.40135496854782104, + "learning_rate": 9.561607775640376e-05, + "loss": 1.9462, + "step": 876 + }, + { + "epoch": 0.26620124449840643, + "grad_norm": 0.5949604511260986, + "learning_rate": 9.561101549053356e-05, + "loss": 1.8797, + "step": 877 + }, + { + "epoch": 0.26650478069509786, + "grad_norm": 0.38301005959510803, + "learning_rate": 9.560595322466336e-05, + "loss": 2.0887, + "step": 878 + }, + { + "epoch": 0.26680831689178935, + "grad_norm": 0.6215627789497375, + "learning_rate": 9.560089095879317e-05, + "loss": 1.7846, + "step": 879 + }, + { + "epoch": 0.2671118530884808, + "grad_norm": 0.4041058123111725, + "learning_rate": 9.559582869292297e-05, + "loss": 1.5127, + "step": 880 + }, + { + "epoch": 0.2674153892851723, + "grad_norm": 0.30281975865364075, + "learning_rate": 9.559076642705276e-05, + "loss": 1.8487, + "step": 881 + }, + { + "epoch": 0.2677189254818637, + "grad_norm": 0.34536200761795044, + "learning_rate": 9.558570416118256e-05, + "loss": 1.8976, + "step": 882 + }, + { + "epoch": 0.26802246167855515, + "grad_norm": 0.367245614528656, + "learning_rate": 9.558064189531235e-05, + "loss": 1.9804, + "step": 883 + }, + { + "epoch": 0.26832599787524664, + "grad_norm": 0.41750359535217285, + "learning_rate": 9.557557962944215e-05, + "loss": 1.5932, + "step": 884 + }, + { + "epoch": 0.26862953407193807, + "grad_norm": 0.7777047157287598, + "learning_rate": 9.557051736357194e-05, + "loss": 1.8513, + "step": 885 + }, + { + "epoch": 0.26893307026862956, + "grad_norm": 0.3720252215862274, + "learning_rate": 9.556545509770174e-05, + "loss": 2.1819, + "step": 886 + }, + { + "epoch": 0.269236606465321, + "grad_norm": 0.7321712970733643, + "learning_rate": 9.556039283183153e-05, + "loss": 1.4653, + "step": 887 + }, + { + "epoch": 0.2695401426620124, + "grad_norm": 0.4140429198741913, + "learning_rate": 9.555533056596133e-05, + "loss": 1.9816, + "step": 888 + }, + { + "epoch": 0.2698436788587039, + "grad_norm": 0.40684935450553894, + "learning_rate": 9.555026830009113e-05, + "loss": 1.5866, + "step": 889 + }, + { + "epoch": 0.27014721505539535, + "grad_norm": 0.4067225754261017, + "learning_rate": 9.554520603422093e-05, + "loss": 1.5951, + "step": 890 + }, + { + "epoch": 0.2704507512520868, + "grad_norm": 0.34240391850471497, + "learning_rate": 9.554014376835072e-05, + "loss": 1.9076, + "step": 891 + }, + { + "epoch": 0.2707542874487783, + "grad_norm": 0.4634522795677185, + "learning_rate": 9.553508150248052e-05, + "loss": 1.9856, + "step": 892 + }, + { + "epoch": 0.2710578236454697, + "grad_norm": 0.408015638589859, + "learning_rate": 9.553001923661031e-05, + "loss": 1.7997, + "step": 893 + }, + { + "epoch": 0.2713613598421612, + "grad_norm": 0.3894648253917694, + "learning_rate": 9.552495697074011e-05, + "loss": 1.8381, + "step": 894 + }, + { + "epoch": 0.27166489603885263, + "grad_norm": 0.37494730949401855, + "learning_rate": 9.55198947048699e-05, + "loss": 2.0548, + "step": 895 + }, + { + "epoch": 0.27196843223554407, + "grad_norm": 0.39796411991119385, + "learning_rate": 9.55148324389997e-05, + "loss": 1.9272, + "step": 896 + }, + { + "epoch": 0.27227196843223556, + "grad_norm": 0.40153494477272034, + "learning_rate": 9.550977017312949e-05, + "loss": 1.7136, + "step": 897 + }, + { + "epoch": 0.272575504628927, + "grad_norm": 0.39771386981010437, + "learning_rate": 9.55047079072593e-05, + "loss": 2.1017, + "step": 898 + }, + { + "epoch": 0.2728790408256185, + "grad_norm": 0.4085974097251892, + "learning_rate": 9.54996456413891e-05, + "loss": 1.3951, + "step": 899 + }, + { + "epoch": 0.2731825770223099, + "grad_norm": 0.39849239587783813, + "learning_rate": 9.549458337551889e-05, + "loss": 1.9988, + "step": 900 + }, + { + "epoch": 0.27348611321900135, + "grad_norm": 0.38662001490592957, + "learning_rate": 9.548952110964868e-05, + "loss": 1.8491, + "step": 901 + }, + { + "epoch": 0.27378964941569284, + "grad_norm": 0.38078710436820984, + "learning_rate": 9.548445884377848e-05, + "loss": 1.9, + "step": 902 + }, + { + "epoch": 0.27409318561238427, + "grad_norm": 0.3548724949359894, + "learning_rate": 9.547939657790827e-05, + "loss": 1.8754, + "step": 903 + }, + { + "epoch": 0.2743967218090757, + "grad_norm": 0.37712323665618896, + "learning_rate": 9.547433431203807e-05, + "loss": 1.5497, + "step": 904 + }, + { + "epoch": 0.2747002580057672, + "grad_norm": 0.4060449004173279, + "learning_rate": 9.546927204616786e-05, + "loss": 1.7231, + "step": 905 + }, + { + "epoch": 0.27500379420245863, + "grad_norm": 0.42080479860305786, + "learning_rate": 9.546420978029766e-05, + "loss": 2.1538, + "step": 906 + }, + { + "epoch": 0.2753073303991501, + "grad_norm": 0.4034046232700348, + "learning_rate": 9.545914751442747e-05, + "loss": 1.7335, + "step": 907 + }, + { + "epoch": 0.27561086659584155, + "grad_norm": 0.3676345646381378, + "learning_rate": 9.545408524855726e-05, + "loss": 1.6193, + "step": 908 + }, + { + "epoch": 0.275914402792533, + "grad_norm": 0.3349851965904236, + "learning_rate": 9.544902298268706e-05, + "loss": 1.8997, + "step": 909 + }, + { + "epoch": 0.2762179389892245, + "grad_norm": 0.3676302134990692, + "learning_rate": 9.544396071681685e-05, + "loss": 1.4031, + "step": 910 + }, + { + "epoch": 0.2765214751859159, + "grad_norm": 0.36593666672706604, + "learning_rate": 9.543889845094665e-05, + "loss": 1.8838, + "step": 911 + }, + { + "epoch": 0.2768250113826074, + "grad_norm": 0.3793712258338928, + "learning_rate": 9.543383618507644e-05, + "loss": 1.5949, + "step": 912 + }, + { + "epoch": 0.27712854757929883, + "grad_norm": 0.47586631774902344, + "learning_rate": 9.542877391920624e-05, + "loss": 1.5687, + "step": 913 + }, + { + "epoch": 0.27743208377599027, + "grad_norm": 0.38850024342536926, + "learning_rate": 9.542371165333603e-05, + "loss": 1.7336, + "step": 914 + }, + { + "epoch": 0.27773561997268176, + "grad_norm": 0.4039680063724518, + "learning_rate": 9.541864938746583e-05, + "loss": 2.0476, + "step": 915 + }, + { + "epoch": 0.2780391561693732, + "grad_norm": 0.40498992800712585, + "learning_rate": 9.541358712159562e-05, + "loss": 1.6699, + "step": 916 + }, + { + "epoch": 0.2783426923660646, + "grad_norm": 0.39011168479919434, + "learning_rate": 9.540852485572543e-05, + "loss": 1.9935, + "step": 917 + }, + { + "epoch": 0.2786462285627561, + "grad_norm": 0.3864549696445465, + "learning_rate": 9.540346258985522e-05, + "loss": 1.8271, + "step": 918 + }, + { + "epoch": 0.27894976475944755, + "grad_norm": 0.33493247628211975, + "learning_rate": 9.539840032398502e-05, + "loss": 1.856, + "step": 919 + }, + { + "epoch": 0.27925330095613904, + "grad_norm": 0.34132060408592224, + "learning_rate": 9.539333805811481e-05, + "loss": 1.8836, + "step": 920 + }, + { + "epoch": 0.2795568371528305, + "grad_norm": 1.5312176942825317, + "learning_rate": 9.538827579224461e-05, + "loss": 2.0207, + "step": 921 + }, + { + "epoch": 0.2798603733495219, + "grad_norm": 0.333932489156723, + "learning_rate": 9.53832135263744e-05, + "loss": 2.0908, + "step": 922 + }, + { + "epoch": 0.2801639095462134, + "grad_norm": 0.3688269555568695, + "learning_rate": 9.537815126050421e-05, + "loss": 1.8464, + "step": 923 + }, + { + "epoch": 0.28046744574290483, + "grad_norm": 0.4097294211387634, + "learning_rate": 9.5373088994634e-05, + "loss": 1.6891, + "step": 924 + }, + { + "epoch": 0.2807709819395963, + "grad_norm": 0.3737453818321228, + "learning_rate": 9.53680267287638e-05, + "loss": 2.0549, + "step": 925 + }, + { + "epoch": 0.28107451813628775, + "grad_norm": 0.6109428405761719, + "learning_rate": 9.53629644628936e-05, + "loss": 1.9437, + "step": 926 + }, + { + "epoch": 0.2813780543329792, + "grad_norm": 0.46215322613716125, + "learning_rate": 9.535790219702339e-05, + "loss": 1.5133, + "step": 927 + }, + { + "epoch": 0.2816815905296707, + "grad_norm": 0.8070108294487, + "learning_rate": 9.53528399311532e-05, + "loss": 1.8843, + "step": 928 + }, + { + "epoch": 0.2819851267263621, + "grad_norm": 0.40304142236709595, + "learning_rate": 9.534777766528299e-05, + "loss": 1.9742, + "step": 929 + }, + { + "epoch": 0.2822886629230536, + "grad_norm": 0.35046708583831787, + "learning_rate": 9.534271539941279e-05, + "loss": 1.8969, + "step": 930 + }, + { + "epoch": 0.28259219911974504, + "grad_norm": 0.37241777777671814, + "learning_rate": 9.533765313354258e-05, + "loss": 1.8138, + "step": 931 + }, + { + "epoch": 0.28289573531643647, + "grad_norm": 0.38689473271369934, + "learning_rate": 9.533259086767238e-05, + "loss": 1.669, + "step": 932 + }, + { + "epoch": 0.28319927151312796, + "grad_norm": 0.3672066926956177, + "learning_rate": 9.532752860180217e-05, + "loss": 1.9093, + "step": 933 + }, + { + "epoch": 0.2835028077098194, + "grad_norm": 0.4022217392921448, + "learning_rate": 9.532246633593197e-05, + "loss": 1.6959, + "step": 934 + }, + { + "epoch": 0.2838063439065108, + "grad_norm": 0.3894721269607544, + "learning_rate": 9.531740407006176e-05, + "loss": 1.9898, + "step": 935 + }, + { + "epoch": 0.2841098801032023, + "grad_norm": 0.4395015835762024, + "learning_rate": 9.531234180419156e-05, + "loss": 1.5538, + "step": 936 + }, + { + "epoch": 0.28441341629989375, + "grad_norm": 0.8121886849403381, + "learning_rate": 9.530727953832136e-05, + "loss": 1.7403, + "step": 937 + }, + { + "epoch": 0.28471695249658524, + "grad_norm": 0.40073227882385254, + "learning_rate": 9.530221727245116e-05, + "loss": 2.0544, + "step": 938 + }, + { + "epoch": 0.2850204886932767, + "grad_norm": 0.3571331202983856, + "learning_rate": 9.529715500658095e-05, + "loss": 1.7157, + "step": 939 + }, + { + "epoch": 0.2853240248899681, + "grad_norm": 0.485147625207901, + "learning_rate": 9.529209274071075e-05, + "loss": 2.1489, + "step": 940 + }, + { + "epoch": 0.2856275610866596, + "grad_norm": 0.6882160305976868, + "learning_rate": 9.528703047484054e-05, + "loss": 1.8458, + "step": 941 + }, + { + "epoch": 0.28593109728335103, + "grad_norm": 0.7156968116760254, + "learning_rate": 9.528196820897034e-05, + "loss": 1.9529, + "step": 942 + }, + { + "epoch": 0.2862346334800425, + "grad_norm": 0.4198112487792969, + "learning_rate": 9.527690594310013e-05, + "loss": 2.0355, + "step": 943 + }, + { + "epoch": 0.28653816967673396, + "grad_norm": 0.4178343117237091, + "learning_rate": 9.527184367722993e-05, + "loss": 1.5801, + "step": 944 + }, + { + "epoch": 0.2868417058734254, + "grad_norm": 0.3721866011619568, + "learning_rate": 9.526678141135972e-05, + "loss": 2.1657, + "step": 945 + }, + { + "epoch": 0.2871452420701169, + "grad_norm": 0.38586944341659546, + "learning_rate": 9.526171914548953e-05, + "loss": 1.4879, + "step": 946 + }, + { + "epoch": 0.2874487782668083, + "grad_norm": 0.42727598547935486, + "learning_rate": 9.525665687961933e-05, + "loss": 1.8434, + "step": 947 + }, + { + "epoch": 0.28775231446349975, + "grad_norm": 0.3686284124851227, + "learning_rate": 9.525159461374912e-05, + "loss": 1.9346, + "step": 948 + }, + { + "epoch": 0.28805585066019124, + "grad_norm": 0.41984260082244873, + "learning_rate": 9.524653234787892e-05, + "loss": 1.4474, + "step": 949 + }, + { + "epoch": 0.28835938685688267, + "grad_norm": 0.4530123174190521, + "learning_rate": 9.524147008200871e-05, + "loss": 1.6863, + "step": 950 + }, + { + "epoch": 0.28866292305357416, + "grad_norm": 0.40047594904899597, + "learning_rate": 9.52364078161385e-05, + "loss": 1.908, + "step": 951 + }, + { + "epoch": 0.2889664592502656, + "grad_norm": 0.3757762610912323, + "learning_rate": 9.52313455502683e-05, + "loss": 1.6235, + "step": 952 + }, + { + "epoch": 0.28926999544695703, + "grad_norm": 0.4337126612663269, + "learning_rate": 9.52262832843981e-05, + "loss": 1.6229, + "step": 953 + }, + { + "epoch": 0.2895735316436485, + "grad_norm": 0.4407886564731598, + "learning_rate": 9.522122101852789e-05, + "loss": 1.875, + "step": 954 + }, + { + "epoch": 0.28987706784033995, + "grad_norm": 0.5278657674789429, + "learning_rate": 9.521615875265768e-05, + "loss": 1.7199, + "step": 955 + }, + { + "epoch": 0.29018060403703144, + "grad_norm": 0.4441334307193756, + "learning_rate": 9.521109648678749e-05, + "loss": 1.1319, + "step": 956 + }, + { + "epoch": 0.2904841402337229, + "grad_norm": 0.3992663025856018, + "learning_rate": 9.520603422091729e-05, + "loss": 1.6948, + "step": 957 + }, + { + "epoch": 0.2907876764304143, + "grad_norm": 0.3979544937610626, + "learning_rate": 9.520097195504708e-05, + "loss": 1.8689, + "step": 958 + }, + { + "epoch": 0.2910912126271058, + "grad_norm": 0.4011298418045044, + "learning_rate": 9.519590968917688e-05, + "loss": 1.9491, + "step": 959 + }, + { + "epoch": 0.29139474882379723, + "grad_norm": 0.4377354383468628, + "learning_rate": 9.519084742330667e-05, + "loss": 1.7274, + "step": 960 + }, + { + "epoch": 0.29169828502048867, + "grad_norm": 0.5056617856025696, + "learning_rate": 9.518578515743647e-05, + "loss": 2.006, + "step": 961 + }, + { + "epoch": 0.29200182121718016, + "grad_norm": 0.36736002564430237, + "learning_rate": 9.518072289156626e-05, + "loss": 1.6558, + "step": 962 + }, + { + "epoch": 0.2923053574138716, + "grad_norm": 0.37966540455818176, + "learning_rate": 9.517566062569606e-05, + "loss": 2.0098, + "step": 963 + }, + { + "epoch": 0.2926088936105631, + "grad_norm": 0.4026505947113037, + "learning_rate": 9.517059835982585e-05, + "loss": 1.868, + "step": 964 + }, + { + "epoch": 0.2929124298072545, + "grad_norm": 0.461910218000412, + "learning_rate": 9.516553609395566e-05, + "loss": 2.1131, + "step": 965 + }, + { + "epoch": 0.29321596600394595, + "grad_norm": 0.4329175651073456, + "learning_rate": 9.516047382808545e-05, + "loss": 2.0068, + "step": 966 + }, + { + "epoch": 0.29351950220063744, + "grad_norm": 0.7611956000328064, + "learning_rate": 9.515541156221526e-05, + "loss": 1.9177, + "step": 967 + }, + { + "epoch": 0.2938230383973289, + "grad_norm": 0.6180218458175659, + "learning_rate": 9.515034929634506e-05, + "loss": 1.5603, + "step": 968 + }, + { + "epoch": 0.29412657459402036, + "grad_norm": 0.6556726694107056, + "learning_rate": 9.514528703047485e-05, + "loss": 2.1081, + "step": 969 + }, + { + "epoch": 0.2944301107907118, + "grad_norm": 0.3379404842853546, + "learning_rate": 9.514022476460465e-05, + "loss": 1.9701, + "step": 970 + }, + { + "epoch": 0.29473364698740323, + "grad_norm": 0.42676112055778503, + "learning_rate": 9.513516249873444e-05, + "loss": 1.6116, + "step": 971 + }, + { + "epoch": 0.2950371831840947, + "grad_norm": 0.35374894738197327, + "learning_rate": 9.513010023286424e-05, + "loss": 2.0621, + "step": 972 + }, + { + "epoch": 0.29534071938078615, + "grad_norm": 0.33012476563453674, + "learning_rate": 9.512503796699403e-05, + "loss": 1.4534, + "step": 973 + }, + { + "epoch": 0.2956442555774776, + "grad_norm": 0.37993383407592773, + "learning_rate": 9.511997570112383e-05, + "loss": 1.6306, + "step": 974 + }, + { + "epoch": 0.2959477917741691, + "grad_norm": 0.47140204906463623, + "learning_rate": 9.511491343525362e-05, + "loss": 2.0465, + "step": 975 + }, + { + "epoch": 0.2962513279708605, + "grad_norm": 0.40235936641693115, + "learning_rate": 9.510985116938343e-05, + "loss": 1.8247, + "step": 976 + }, + { + "epoch": 0.296554864167552, + "grad_norm": 0.3992665112018585, + "learning_rate": 9.510478890351322e-05, + "loss": 1.5702, + "step": 977 + }, + { + "epoch": 0.29685840036424344, + "grad_norm": 0.4469521641731262, + "learning_rate": 9.509972663764302e-05, + "loss": 1.8811, + "step": 978 + }, + { + "epoch": 0.29716193656093487, + "grad_norm": 0.41400644183158875, + "learning_rate": 9.509466437177281e-05, + "loss": 1.5374, + "step": 979 + }, + { + "epoch": 0.29746547275762636, + "grad_norm": 0.36348387598991394, + "learning_rate": 9.508960210590261e-05, + "loss": 1.9022, + "step": 980 + }, + { + "epoch": 0.2977690089543178, + "grad_norm": 0.4069242477416992, + "learning_rate": 9.50845398400324e-05, + "loss": 2.0066, + "step": 981 + }, + { + "epoch": 0.2980725451510093, + "grad_norm": 0.3684113323688507, + "learning_rate": 9.50794775741622e-05, + "loss": 1.8972, + "step": 982 + }, + { + "epoch": 0.2983760813477007, + "grad_norm": 0.40827688574790955, + "learning_rate": 9.5074415308292e-05, + "loss": 2.0659, + "step": 983 + }, + { + "epoch": 0.29867961754439215, + "grad_norm": 0.32065409421920776, + "learning_rate": 9.506935304242179e-05, + "loss": 2.0008, + "step": 984 + }, + { + "epoch": 0.29898315374108364, + "grad_norm": 0.38805294036865234, + "learning_rate": 9.50642907765516e-05, + "loss": 1.5027, + "step": 985 + }, + { + "epoch": 0.2992866899377751, + "grad_norm": 0.3656708896160126, + "learning_rate": 9.505922851068139e-05, + "loss": 1.7931, + "step": 986 + }, + { + "epoch": 0.2995902261344665, + "grad_norm": 0.4354289770126343, + "learning_rate": 9.505416624481119e-05, + "loss": 2.1183, + "step": 987 + }, + { + "epoch": 0.299893762331158, + "grad_norm": 0.3970641493797302, + "learning_rate": 9.504910397894098e-05, + "loss": 1.8188, + "step": 988 + }, + { + "epoch": 0.30019729852784943, + "grad_norm": 0.35527995228767395, + "learning_rate": 9.504404171307078e-05, + "loss": 1.6329, + "step": 989 + }, + { + "epoch": 0.3005008347245409, + "grad_norm": 0.4018630385398865, + "learning_rate": 9.503897944720057e-05, + "loss": 1.993, + "step": 990 + }, + { + "epoch": 0.30080437092123236, + "grad_norm": 0.36514052748680115, + "learning_rate": 9.503391718133037e-05, + "loss": 2.0482, + "step": 991 + }, + { + "epoch": 0.3011079071179238, + "grad_norm": 0.3790993094444275, + "learning_rate": 9.502885491546016e-05, + "loss": 2.0286, + "step": 992 + }, + { + "epoch": 0.3014114433146153, + "grad_norm": 0.314779669046402, + "learning_rate": 9.502379264958995e-05, + "loss": 1.8135, + "step": 993 + }, + { + "epoch": 0.3017149795113067, + "grad_norm": 0.42383378744125366, + "learning_rate": 9.501873038371975e-05, + "loss": 1.8783, + "step": 994 + }, + { + "epoch": 0.3020185157079982, + "grad_norm": 0.4036683738231659, + "learning_rate": 9.501366811784956e-05, + "loss": 1.6091, + "step": 995 + }, + { + "epoch": 0.30232205190468964, + "grad_norm": 0.3611324429512024, + "learning_rate": 9.500860585197935e-05, + "loss": 1.3388, + "step": 996 + }, + { + "epoch": 0.30262558810138107, + "grad_norm": 0.44210389256477356, + "learning_rate": 9.500354358610915e-05, + "loss": 1.6133, + "step": 997 + }, + { + "epoch": 0.30292912429807256, + "grad_norm": 0.37780526280403137, + "learning_rate": 9.499848132023894e-05, + "loss": 1.9993, + "step": 998 + }, + { + "epoch": 0.303232660494764, + "grad_norm": 0.469959557056427, + "learning_rate": 9.499341905436874e-05, + "loss": 1.8094, + "step": 999 + }, + { + "epoch": 0.30353619669145543, + "grad_norm": 0.38992664217948914, + "learning_rate": 9.498835678849853e-05, + "loss": 1.8975, + "step": 1000 + }, + { + "epoch": 0.3038397328881469, + "grad_norm": 0.44024091958999634, + "learning_rate": 9.498329452262833e-05, + "loss": 1.7081, + "step": 1001 + }, + { + "epoch": 0.30414326908483835, + "grad_norm": 0.32488685846328735, + "learning_rate": 9.497823225675812e-05, + "loss": 1.4921, + "step": 1002 + }, + { + "epoch": 0.30444680528152984, + "grad_norm": 0.7046712636947632, + "learning_rate": 9.497316999088792e-05, + "loss": 1.9693, + "step": 1003 + }, + { + "epoch": 0.3047503414782213, + "grad_norm": 0.39591220021247864, + "learning_rate": 9.496810772501772e-05, + "loss": 2.0266, + "step": 1004 + }, + { + "epoch": 0.3050538776749127, + "grad_norm": 0.371804416179657, + "learning_rate": 9.496304545914752e-05, + "loss": 1.9906, + "step": 1005 + }, + { + "epoch": 0.3053574138716042, + "grad_norm": 0.32893630862236023, + "learning_rate": 9.495798319327731e-05, + "loss": 1.9469, + "step": 1006 + }, + { + "epoch": 0.30566095006829563, + "grad_norm": 0.406531423330307, + "learning_rate": 9.495292092740711e-05, + "loss": 1.7575, + "step": 1007 + }, + { + "epoch": 0.3059644862649871, + "grad_norm": 0.3299405872821808, + "learning_rate": 9.49478586615369e-05, + "loss": 1.6457, + "step": 1008 + }, + { + "epoch": 0.30626802246167856, + "grad_norm": 0.40007394552230835, + "learning_rate": 9.49427963956667e-05, + "loss": 1.9291, + "step": 1009 + }, + { + "epoch": 0.30657155865837, + "grad_norm": 0.41286107897758484, + "learning_rate": 9.49377341297965e-05, + "loss": 1.9869, + "step": 1010 + }, + { + "epoch": 0.3068750948550615, + "grad_norm": 0.6297092437744141, + "learning_rate": 9.493267186392629e-05, + "loss": 2.1354, + "step": 1011 + }, + { + "epoch": 0.3071786310517529, + "grad_norm": 0.4763343334197998, + "learning_rate": 9.49276095980561e-05, + "loss": 1.6641, + "step": 1012 + }, + { + "epoch": 0.3074821672484444, + "grad_norm": 0.343218058347702, + "learning_rate": 9.492254733218589e-05, + "loss": 1.9556, + "step": 1013 + }, + { + "epoch": 0.30778570344513584, + "grad_norm": 0.4180206060409546, + "learning_rate": 9.491748506631569e-05, + "loss": 2.064, + "step": 1014 + }, + { + "epoch": 0.3080892396418273, + "grad_norm": 0.3307478725910187, + "learning_rate": 9.49124228004455e-05, + "loss": 1.9579, + "step": 1015 + }, + { + "epoch": 0.30839277583851876, + "grad_norm": 0.31935417652130127, + "learning_rate": 9.490736053457529e-05, + "loss": 2.0038, + "step": 1016 + }, + { + "epoch": 0.3086963120352102, + "grad_norm": 0.4078797399997711, + "learning_rate": 9.490229826870508e-05, + "loss": 1.6727, + "step": 1017 + }, + { + "epoch": 0.30899984823190163, + "grad_norm": 0.4393940269947052, + "learning_rate": 9.489723600283488e-05, + "loss": 1.9709, + "step": 1018 + }, + { + "epoch": 0.3093033844285931, + "grad_norm": 0.41586485505104065, + "learning_rate": 9.489217373696467e-05, + "loss": 1.9976, + "step": 1019 + }, + { + "epoch": 0.30960692062528455, + "grad_norm": 0.32988855242729187, + "learning_rate": 9.488711147109447e-05, + "loss": 2.1278, + "step": 1020 + }, + { + "epoch": 0.30991045682197604, + "grad_norm": 0.47184863686561584, + "learning_rate": 9.488204920522426e-05, + "loss": 1.8132, + "step": 1021 + }, + { + "epoch": 0.3102139930186675, + "grad_norm": 0.32716313004493713, + "learning_rate": 9.487698693935406e-05, + "loss": 1.6124, + "step": 1022 + }, + { + "epoch": 0.3105175292153589, + "grad_norm": 0.46906420588493347, + "learning_rate": 9.487192467348385e-05, + "loss": 1.9718, + "step": 1023 + }, + { + "epoch": 0.3108210654120504, + "grad_norm": 0.3436840772628784, + "learning_rate": 9.486686240761366e-05, + "loss": 1.809, + "step": 1024 + }, + { + "epoch": 0.31112460160874184, + "grad_norm": 0.39674249291419983, + "learning_rate": 9.486180014174346e-05, + "loss": 1.5307, + "step": 1025 + }, + { + "epoch": 0.3114281378054333, + "grad_norm": 0.40978574752807617, + "learning_rate": 9.485673787587325e-05, + "loss": 2.0251, + "step": 1026 + }, + { + "epoch": 0.31173167400212476, + "grad_norm": 0.39651399850845337, + "learning_rate": 9.485167561000305e-05, + "loss": 1.8872, + "step": 1027 + }, + { + "epoch": 0.3120352101988162, + "grad_norm": 0.7730064988136292, + "learning_rate": 9.484661334413284e-05, + "loss": 1.7339, + "step": 1028 + }, + { + "epoch": 0.3123387463955077, + "grad_norm": 0.36178770661354065, + "learning_rate": 9.484155107826264e-05, + "loss": 1.8594, + "step": 1029 + }, + { + "epoch": 0.3126422825921991, + "grad_norm": 0.4153605103492737, + "learning_rate": 9.483648881239243e-05, + "loss": 1.8687, + "step": 1030 + }, + { + "epoch": 0.31294581878889055, + "grad_norm": 0.41472381353378296, + "learning_rate": 9.483142654652222e-05, + "loss": 1.9665, + "step": 1031 + }, + { + "epoch": 0.31324935498558204, + "grad_norm": 0.3871115744113922, + "learning_rate": 9.482636428065202e-05, + "loss": 2.1858, + "step": 1032 + }, + { + "epoch": 0.3135528911822735, + "grad_norm": 0.33978626132011414, + "learning_rate": 9.482130201478181e-05, + "loss": 1.5615, + "step": 1033 + }, + { + "epoch": 0.31385642737896496, + "grad_norm": 0.33726009726524353, + "learning_rate": 9.481623974891162e-05, + "loss": 2.1119, + "step": 1034 + }, + { + "epoch": 0.3141599635756564, + "grad_norm": 0.35080355405807495, + "learning_rate": 9.481117748304142e-05, + "loss": 1.9497, + "step": 1035 + }, + { + "epoch": 0.31446349977234783, + "grad_norm": 0.37655749917030334, + "learning_rate": 9.480611521717121e-05, + "loss": 1.6486, + "step": 1036 + }, + { + "epoch": 0.3147670359690393, + "grad_norm": 0.3838097155094147, + "learning_rate": 9.480105295130101e-05, + "loss": 2.0504, + "step": 1037 + }, + { + "epoch": 0.31507057216573076, + "grad_norm": 0.3412497341632843, + "learning_rate": 9.47959906854308e-05, + "loss": 1.8417, + "step": 1038 + }, + { + "epoch": 0.31537410836242225, + "grad_norm": 0.3633384108543396, + "learning_rate": 9.47909284195606e-05, + "loss": 1.9713, + "step": 1039 + }, + { + "epoch": 0.3156776445591137, + "grad_norm": 0.332861989736557, + "learning_rate": 9.478586615369039e-05, + "loss": 1.8967, + "step": 1040 + }, + { + "epoch": 0.3159811807558051, + "grad_norm": 0.5054538249969482, + "learning_rate": 9.478080388782019e-05, + "loss": 1.8217, + "step": 1041 + }, + { + "epoch": 0.3162847169524966, + "grad_norm": 0.30825376510620117, + "learning_rate": 9.477574162194998e-05, + "loss": 1.8026, + "step": 1042 + }, + { + "epoch": 0.31658825314918804, + "grad_norm": 0.3759863078594208, + "learning_rate": 9.477067935607979e-05, + "loss": 1.6662, + "step": 1043 + }, + { + "epoch": 0.31689178934587947, + "grad_norm": 0.36408594250679016, + "learning_rate": 9.476561709020958e-05, + "loss": 2.3524, + "step": 1044 + }, + { + "epoch": 0.31719532554257096, + "grad_norm": 0.38226181268692017, + "learning_rate": 9.476055482433938e-05, + "loss": 1.8966, + "step": 1045 + }, + { + "epoch": 0.3174988617392624, + "grad_norm": 0.35480546951293945, + "learning_rate": 9.475549255846917e-05, + "loss": 1.9114, + "step": 1046 + }, + { + "epoch": 0.3178023979359539, + "grad_norm": 0.378701776266098, + "learning_rate": 9.475043029259897e-05, + "loss": 1.9151, + "step": 1047 + }, + { + "epoch": 0.3181059341326453, + "grad_norm": 1.0800230503082275, + "learning_rate": 9.474536802672876e-05, + "loss": 1.3396, + "step": 1048 + }, + { + "epoch": 0.31840947032933675, + "grad_norm": 0.4015067219734192, + "learning_rate": 9.474030576085856e-05, + "loss": 1.6889, + "step": 1049 + }, + { + "epoch": 0.31871300652602824, + "grad_norm": 0.35431405901908875, + "learning_rate": 9.473524349498835e-05, + "loss": 1.4716, + "step": 1050 + }, + { + "epoch": 0.3190165427227197, + "grad_norm": 0.4030434787273407, + "learning_rate": 9.473018122911815e-05, + "loss": 1.6192, + "step": 1051 + }, + { + "epoch": 0.31932007891941117, + "grad_norm": 0.4005342423915863, + "learning_rate": 9.472511896324796e-05, + "loss": 1.5092, + "step": 1052 + }, + { + "epoch": 0.3196236151161026, + "grad_norm": 1.130418062210083, + "learning_rate": 9.472005669737775e-05, + "loss": 1.5802, + "step": 1053 + }, + { + "epoch": 0.31992715131279403, + "grad_norm": 0.41232943534851074, + "learning_rate": 9.471499443150755e-05, + "loss": 2.0205, + "step": 1054 + }, + { + "epoch": 0.3202306875094855, + "grad_norm": 0.4155721366405487, + "learning_rate": 9.470993216563734e-05, + "loss": 1.88, + "step": 1055 + }, + { + "epoch": 0.32053422370617696, + "grad_norm": 0.36597010493278503, + "learning_rate": 9.470486989976715e-05, + "loss": 1.9922, + "step": 1056 + }, + { + "epoch": 0.3208377599028684, + "grad_norm": 0.8094148635864258, + "learning_rate": 9.469980763389694e-05, + "loss": 1.8267, + "step": 1057 + }, + { + "epoch": 0.3211412960995599, + "grad_norm": 0.36358359456062317, + "learning_rate": 9.469474536802674e-05, + "loss": 1.5307, + "step": 1058 + }, + { + "epoch": 0.3214448322962513, + "grad_norm": 0.400796502828598, + "learning_rate": 9.468968310215653e-05, + "loss": 1.9742, + "step": 1059 + }, + { + "epoch": 0.3217483684929428, + "grad_norm": 0.3251611888408661, + "learning_rate": 9.468462083628633e-05, + "loss": 1.7736, + "step": 1060 + }, + { + "epoch": 0.32205190468963424, + "grad_norm": 0.4060586988925934, + "learning_rate": 9.467955857041612e-05, + "loss": 1.7211, + "step": 1061 + }, + { + "epoch": 0.3223554408863257, + "grad_norm": 0.4181293547153473, + "learning_rate": 9.467449630454592e-05, + "loss": 1.5085, + "step": 1062 + }, + { + "epoch": 0.32265897708301716, + "grad_norm": 0.3514660894870758, + "learning_rate": 9.466943403867573e-05, + "loss": 1.8939, + "step": 1063 + }, + { + "epoch": 0.3229625132797086, + "grad_norm": 0.3337076008319855, + "learning_rate": 9.466437177280552e-05, + "loss": 1.6281, + "step": 1064 + }, + { + "epoch": 0.3232660494764001, + "grad_norm": 0.39011150598526, + "learning_rate": 9.465930950693532e-05, + "loss": 2.3316, + "step": 1065 + }, + { + "epoch": 0.3235695856730915, + "grad_norm": 0.42054951190948486, + "learning_rate": 9.465424724106511e-05, + "loss": 1.9249, + "step": 1066 + }, + { + "epoch": 0.32387312186978295, + "grad_norm": 0.37516888976097107, + "learning_rate": 9.46491849751949e-05, + "loss": 1.9643, + "step": 1067 + }, + { + "epoch": 0.32417665806647444, + "grad_norm": 0.3549358546733856, + "learning_rate": 9.46441227093247e-05, + "loss": 1.9069, + "step": 1068 + }, + { + "epoch": 0.3244801942631659, + "grad_norm": 1.4541680812835693, + "learning_rate": 9.46390604434545e-05, + "loss": 1.4868, + "step": 1069 + }, + { + "epoch": 0.3247837304598573, + "grad_norm": 0.31561896204948425, + "learning_rate": 9.463399817758429e-05, + "loss": 1.8972, + "step": 1070 + }, + { + "epoch": 0.3250872666565488, + "grad_norm": 0.35816720128059387, + "learning_rate": 9.462893591171408e-05, + "loss": 2.0286, + "step": 1071 + }, + { + "epoch": 0.32539080285324024, + "grad_norm": 0.38618069887161255, + "learning_rate": 9.462387364584388e-05, + "loss": 1.7604, + "step": 1072 + }, + { + "epoch": 0.3256943390499317, + "grad_norm": 0.42617419362068176, + "learning_rate": 9.461881137997369e-05, + "loss": 1.2767, + "step": 1073 + }, + { + "epoch": 0.32599787524662316, + "grad_norm": 0.3996577262878418, + "learning_rate": 9.461374911410348e-05, + "loss": 2.0023, + "step": 1074 + }, + { + "epoch": 0.3263014114433146, + "grad_norm": 0.6627565026283264, + "learning_rate": 9.460868684823328e-05, + "loss": 2.2386, + "step": 1075 + }, + { + "epoch": 0.3266049476400061, + "grad_norm": 0.3753213882446289, + "learning_rate": 9.460362458236307e-05, + "loss": 1.6935, + "step": 1076 + }, + { + "epoch": 0.3269084838366975, + "grad_norm": 0.4097970724105835, + "learning_rate": 9.459856231649287e-05, + "loss": 1.59, + "step": 1077 + }, + { + "epoch": 0.327212020033389, + "grad_norm": 0.39637240767478943, + "learning_rate": 9.459350005062266e-05, + "loss": 1.5338, + "step": 1078 + }, + { + "epoch": 0.32751555623008044, + "grad_norm": 0.38365036249160767, + "learning_rate": 9.458843778475246e-05, + "loss": 2.0128, + "step": 1079 + }, + { + "epoch": 0.3278190924267719, + "grad_norm": 0.42568036913871765, + "learning_rate": 9.458337551888225e-05, + "loss": 1.3282, + "step": 1080 + }, + { + "epoch": 0.32812262862346336, + "grad_norm": 0.4248203933238983, + "learning_rate": 9.457831325301205e-05, + "loss": 1.9059, + "step": 1081 + }, + { + "epoch": 0.3284261648201548, + "grad_norm": 0.37200963497161865, + "learning_rate": 9.457325098714185e-05, + "loss": 2.085, + "step": 1082 + }, + { + "epoch": 0.32872970101684623, + "grad_norm": 0.44390764832496643, + "learning_rate": 9.456818872127165e-05, + "loss": 1.9431, + "step": 1083 + }, + { + "epoch": 0.3290332372135377, + "grad_norm": 0.44483283162117004, + "learning_rate": 9.456312645540144e-05, + "loss": 2.088, + "step": 1084 + }, + { + "epoch": 0.32933677341022916, + "grad_norm": 0.3765670955181122, + "learning_rate": 9.455806418953124e-05, + "loss": 2.0446, + "step": 1085 + }, + { + "epoch": 0.32964030960692065, + "grad_norm": 0.428964763879776, + "learning_rate": 9.455300192366103e-05, + "loss": 1.9831, + "step": 1086 + }, + { + "epoch": 0.3299438458036121, + "grad_norm": 0.3957151770591736, + "learning_rate": 9.454793965779083e-05, + "loss": 1.7817, + "step": 1087 + }, + { + "epoch": 0.3302473820003035, + "grad_norm": 0.3726184368133545, + "learning_rate": 9.454287739192062e-05, + "loss": 1.9929, + "step": 1088 + }, + { + "epoch": 0.330550918196995, + "grad_norm": 0.41574302315711975, + "learning_rate": 9.453781512605042e-05, + "loss": 2.0094, + "step": 1089 + }, + { + "epoch": 0.33085445439368644, + "grad_norm": 0.36284613609313965, + "learning_rate": 9.453275286018021e-05, + "loss": 2.0273, + "step": 1090 + }, + { + "epoch": 0.3311579905903779, + "grad_norm": 0.48810014128685, + "learning_rate": 9.452769059431002e-05, + "loss": 1.4371, + "step": 1091 + }, + { + "epoch": 0.33146152678706936, + "grad_norm": 0.3929893672466278, + "learning_rate": 9.452262832843982e-05, + "loss": 2.0663, + "step": 1092 + }, + { + "epoch": 0.3317650629837608, + "grad_norm": 0.401722252368927, + "learning_rate": 9.451756606256961e-05, + "loss": 1.6119, + "step": 1093 + }, + { + "epoch": 0.3320685991804523, + "grad_norm": 0.42032745480537415, + "learning_rate": 9.45125037966994e-05, + "loss": 1.7541, + "step": 1094 + }, + { + "epoch": 0.3323721353771437, + "grad_norm": 0.3663571774959564, + "learning_rate": 9.45074415308292e-05, + "loss": 1.4438, + "step": 1095 + }, + { + "epoch": 0.33267567157383515, + "grad_norm": 0.397624671459198, + "learning_rate": 9.4502379264959e-05, + "loss": 2.0996, + "step": 1096 + }, + { + "epoch": 0.33297920777052664, + "grad_norm": 0.3914051651954651, + "learning_rate": 9.449731699908879e-05, + "loss": 1.5906, + "step": 1097 + }, + { + "epoch": 0.3332827439672181, + "grad_norm": 0.3951834440231323, + "learning_rate": 9.449225473321858e-05, + "loss": 1.9128, + "step": 1098 + }, + { + "epoch": 0.33358628016390957, + "grad_norm": 0.363696426153183, + "learning_rate": 9.448719246734838e-05, + "loss": 1.3447, + "step": 1099 + }, + { + "epoch": 0.333889816360601, + "grad_norm": 0.3522724211215973, + "learning_rate": 9.448213020147817e-05, + "loss": 1.6755, + "step": 1100 + }, + { + "epoch": 0.33419335255729243, + "grad_norm": 1.1290934085845947, + "learning_rate": 9.447706793560798e-05, + "loss": 1.6796, + "step": 1101 + }, + { + "epoch": 0.3344968887539839, + "grad_norm": 0.3527061939239502, + "learning_rate": 9.447200566973779e-05, + "loss": 1.4917, + "step": 1102 + }, + { + "epoch": 0.33480042495067536, + "grad_norm": 0.3770875930786133, + "learning_rate": 9.446694340386759e-05, + "loss": 1.2733, + "step": 1103 + }, + { + "epoch": 0.33510396114736685, + "grad_norm": 0.3742992579936981, + "learning_rate": 9.446188113799738e-05, + "loss": 1.8584, + "step": 1104 + }, + { + "epoch": 0.3354074973440583, + "grad_norm": 0.7284528017044067, + "learning_rate": 9.445681887212718e-05, + "loss": 2.0183, + "step": 1105 + }, + { + "epoch": 0.3357110335407497, + "grad_norm": 0.37331897020339966, + "learning_rate": 9.445175660625697e-05, + "loss": 1.9604, + "step": 1106 + }, + { + "epoch": 0.3360145697374412, + "grad_norm": 0.3642507791519165, + "learning_rate": 9.444669434038676e-05, + "loss": 1.8661, + "step": 1107 + }, + { + "epoch": 0.33631810593413264, + "grad_norm": 0.4249272346496582, + "learning_rate": 9.444163207451656e-05, + "loss": 2.039, + "step": 1108 + }, + { + "epoch": 0.33662164213082413, + "grad_norm": 0.5299102067947388, + "learning_rate": 9.443656980864635e-05, + "loss": 1.618, + "step": 1109 + }, + { + "epoch": 0.33692517832751556, + "grad_norm": 0.37671583890914917, + "learning_rate": 9.443150754277615e-05, + "loss": 1.5634, + "step": 1110 + }, + { + "epoch": 0.337228714524207, + "grad_norm": 0.9504343867301941, + "learning_rate": 9.442644527690594e-05, + "loss": 1.7275, + "step": 1111 + }, + { + "epoch": 0.3375322507208985, + "grad_norm": 0.37230974435806274, + "learning_rate": 9.442138301103575e-05, + "loss": 1.9971, + "step": 1112 + }, + { + "epoch": 0.3378357869175899, + "grad_norm": 0.4015982449054718, + "learning_rate": 9.441632074516555e-05, + "loss": 2.0012, + "step": 1113 + }, + { + "epoch": 0.33813932311428135, + "grad_norm": 0.42521438002586365, + "learning_rate": 9.441125847929534e-05, + "loss": 2.1657, + "step": 1114 + }, + { + "epoch": 0.33844285931097284, + "grad_norm": 0.3954319953918457, + "learning_rate": 9.440619621342514e-05, + "loss": 1.7999, + "step": 1115 + }, + { + "epoch": 0.3387463955076643, + "grad_norm": 0.5241403579711914, + "learning_rate": 9.440113394755493e-05, + "loss": 2.0102, + "step": 1116 + }, + { + "epoch": 0.33904993170435577, + "grad_norm": 0.4186641275882721, + "learning_rate": 9.439607168168473e-05, + "loss": 1.854, + "step": 1117 + }, + { + "epoch": 0.3393534679010472, + "grad_norm": 0.4375157654285431, + "learning_rate": 9.439100941581452e-05, + "loss": 1.7774, + "step": 1118 + }, + { + "epoch": 0.33965700409773864, + "grad_norm": 0.43266987800598145, + "learning_rate": 9.438594714994432e-05, + "loss": 1.9328, + "step": 1119 + }, + { + "epoch": 0.3399605402944301, + "grad_norm": 0.5544857382774353, + "learning_rate": 9.438088488407411e-05, + "loss": 1.7284, + "step": 1120 + }, + { + "epoch": 0.34026407649112156, + "grad_norm": 0.39998582005500793, + "learning_rate": 9.437582261820392e-05, + "loss": 1.8724, + "step": 1121 + }, + { + "epoch": 0.34056761268781305, + "grad_norm": 0.41390395164489746, + "learning_rate": 9.437076035233371e-05, + "loss": 2.1188, + "step": 1122 + }, + { + "epoch": 0.3408711488845045, + "grad_norm": 0.4374658167362213, + "learning_rate": 9.436569808646351e-05, + "loss": 1.9946, + "step": 1123 + }, + { + "epoch": 0.3411746850811959, + "grad_norm": 0.3902375102043152, + "learning_rate": 9.43606358205933e-05, + "loss": 1.9352, + "step": 1124 + }, + { + "epoch": 0.3414782212778874, + "grad_norm": 0.4049385190010071, + "learning_rate": 9.43555735547231e-05, + "loss": 1.7674, + "step": 1125 + }, + { + "epoch": 0.34178175747457884, + "grad_norm": 0.42752334475517273, + "learning_rate": 9.435051128885289e-05, + "loss": 2.1489, + "step": 1126 + }, + { + "epoch": 0.3420852936712703, + "grad_norm": 0.3927367925643921, + "learning_rate": 9.434544902298269e-05, + "loss": 1.9454, + "step": 1127 + }, + { + "epoch": 0.34238882986796176, + "grad_norm": 1.4001588821411133, + "learning_rate": 9.434038675711248e-05, + "loss": 1.8791, + "step": 1128 + }, + { + "epoch": 0.3426923660646532, + "grad_norm": 0.3640120327472687, + "learning_rate": 9.433532449124228e-05, + "loss": 1.9714, + "step": 1129 + }, + { + "epoch": 0.3429959022613447, + "grad_norm": 0.3569428026676178, + "learning_rate": 9.433026222537209e-05, + "loss": 1.715, + "step": 1130 + }, + { + "epoch": 0.3432994384580361, + "grad_norm": 0.3593400716781616, + "learning_rate": 9.432519995950188e-05, + "loss": 1.9454, + "step": 1131 + }, + { + "epoch": 0.34360297465472756, + "grad_norm": 0.38255101442337036, + "learning_rate": 9.432013769363168e-05, + "loss": 1.9629, + "step": 1132 + }, + { + "epoch": 0.34390651085141904, + "grad_norm": 0.4099471867084503, + "learning_rate": 9.431507542776147e-05, + "loss": 1.9556, + "step": 1133 + }, + { + "epoch": 0.3442100470481105, + "grad_norm": 0.36562618613243103, + "learning_rate": 9.431001316189126e-05, + "loss": 1.3671, + "step": 1134 + }, + { + "epoch": 0.34451358324480197, + "grad_norm": 0.49943339824676514, + "learning_rate": 9.430495089602106e-05, + "loss": 1.8581, + "step": 1135 + }, + { + "epoch": 0.3448171194414934, + "grad_norm": 0.3707871437072754, + "learning_rate": 9.429988863015085e-05, + "loss": 2.0911, + "step": 1136 + }, + { + "epoch": 0.34512065563818484, + "grad_norm": 0.3699527382850647, + "learning_rate": 9.429482636428065e-05, + "loss": 2.0198, + "step": 1137 + }, + { + "epoch": 0.3454241918348763, + "grad_norm": 0.4300304055213928, + "learning_rate": 9.428976409841044e-05, + "loss": 2.2398, + "step": 1138 + }, + { + "epoch": 0.34572772803156776, + "grad_norm": 0.38733771443367004, + "learning_rate": 9.428470183254024e-05, + "loss": 1.9505, + "step": 1139 + }, + { + "epoch": 0.3460312642282592, + "grad_norm": 0.38434740900993347, + "learning_rate": 9.427963956667005e-05, + "loss": 2.0853, + "step": 1140 + }, + { + "epoch": 0.3463348004249507, + "grad_norm": 0.3448013961315155, + "learning_rate": 9.427457730079984e-05, + "loss": 1.816, + "step": 1141 + }, + { + "epoch": 0.3466383366216421, + "grad_norm": 2.17158842086792, + "learning_rate": 9.426951503492964e-05, + "loss": 1.9041, + "step": 1142 + }, + { + "epoch": 0.3469418728183336, + "grad_norm": 0.39879223704338074, + "learning_rate": 9.426445276905943e-05, + "loss": 1.8991, + "step": 1143 + }, + { + "epoch": 0.34724540901502504, + "grad_norm": 0.517691433429718, + "learning_rate": 9.425939050318923e-05, + "loss": 1.4864, + "step": 1144 + }, + { + "epoch": 0.3475489452117165, + "grad_norm": 0.4679596722126007, + "learning_rate": 9.425432823731903e-05, + "loss": 2.127, + "step": 1145 + }, + { + "epoch": 0.34785248140840797, + "grad_norm": 0.47220855951309204, + "learning_rate": 9.424926597144883e-05, + "loss": 1.1827, + "step": 1146 + }, + { + "epoch": 0.3481560176050994, + "grad_norm": 0.4707253575325012, + "learning_rate": 9.424420370557862e-05, + "loss": 1.6538, + "step": 1147 + }, + { + "epoch": 0.3484595538017909, + "grad_norm": 0.5610188245773315, + "learning_rate": 9.423914143970842e-05, + "loss": 2.0097, + "step": 1148 + }, + { + "epoch": 0.3487630899984823, + "grad_norm": 0.6568597555160522, + "learning_rate": 9.423407917383821e-05, + "loss": 1.8777, + "step": 1149 + }, + { + "epoch": 0.34906662619517376, + "grad_norm": 0.38883280754089355, + "learning_rate": 9.422901690796801e-05, + "loss": 1.8524, + "step": 1150 + }, + { + "epoch": 0.34937016239186525, + "grad_norm": 0.34381693601608276, + "learning_rate": 9.422395464209782e-05, + "loss": 1.7728, + "step": 1151 + }, + { + "epoch": 0.3496736985885567, + "grad_norm": 0.4320678412914276, + "learning_rate": 9.421889237622761e-05, + "loss": 1.6346, + "step": 1152 + }, + { + "epoch": 0.3499772347852481, + "grad_norm": 0.4651411771774292, + "learning_rate": 9.42138301103574e-05, + "loss": 2.008, + "step": 1153 + }, + { + "epoch": 0.3502807709819396, + "grad_norm": 0.5340977907180786, + "learning_rate": 9.42087678444872e-05, + "loss": 1.6537, + "step": 1154 + }, + { + "epoch": 0.35058430717863104, + "grad_norm": 0.3686065971851349, + "learning_rate": 9.4203705578617e-05, + "loss": 1.6585, + "step": 1155 + }, + { + "epoch": 0.35088784337532253, + "grad_norm": 0.4016922116279602, + "learning_rate": 9.419864331274679e-05, + "loss": 1.4874, + "step": 1156 + }, + { + "epoch": 0.35119137957201396, + "grad_norm": 0.4304169714450836, + "learning_rate": 9.419358104687659e-05, + "loss": 2.0125, + "step": 1157 + }, + { + "epoch": 0.3514949157687054, + "grad_norm": 0.3944842517375946, + "learning_rate": 9.418851878100638e-05, + "loss": 2.0397, + "step": 1158 + }, + { + "epoch": 0.3517984519653969, + "grad_norm": 0.3778032064437866, + "learning_rate": 9.418345651513618e-05, + "loss": 2.0629, + "step": 1159 + }, + { + "epoch": 0.3521019881620883, + "grad_norm": 0.4209291636943817, + "learning_rate": 9.417839424926598e-05, + "loss": 1.845, + "step": 1160 + }, + { + "epoch": 0.3524055243587798, + "grad_norm": 0.3948676586151123, + "learning_rate": 9.417333198339578e-05, + "loss": 1.9479, + "step": 1161 + }, + { + "epoch": 0.35270906055547124, + "grad_norm": 0.4018319547176361, + "learning_rate": 9.416826971752557e-05, + "loss": 1.9, + "step": 1162 + }, + { + "epoch": 0.3530125967521627, + "grad_norm": 0.42170947790145874, + "learning_rate": 9.416320745165537e-05, + "loss": 1.8034, + "step": 1163 + }, + { + "epoch": 0.35331613294885417, + "grad_norm": 0.3817223310470581, + "learning_rate": 9.415814518578516e-05, + "loss": 1.4884, + "step": 1164 + }, + { + "epoch": 0.3536196691455456, + "grad_norm": 0.35511648654937744, + "learning_rate": 9.415308291991496e-05, + "loss": 1.7336, + "step": 1165 + }, + { + "epoch": 0.35392320534223703, + "grad_norm": 0.45333489775657654, + "learning_rate": 9.414802065404475e-05, + "loss": 1.6451, + "step": 1166 + }, + { + "epoch": 0.3542267415389285, + "grad_norm": 0.42814895510673523, + "learning_rate": 9.414295838817455e-05, + "loss": 1.8122, + "step": 1167 + }, + { + "epoch": 0.35453027773561996, + "grad_norm": 0.39475324749946594, + "learning_rate": 9.413789612230434e-05, + "loss": 2.2534, + "step": 1168 + }, + { + "epoch": 0.35483381393231145, + "grad_norm": 0.41115859150886536, + "learning_rate": 9.413283385643415e-05, + "loss": 1.8317, + "step": 1169 + }, + { + "epoch": 0.3551373501290029, + "grad_norm": 0.44518032670021057, + "learning_rate": 9.412777159056395e-05, + "loss": 1.8648, + "step": 1170 + }, + { + "epoch": 0.3554408863256943, + "grad_norm": 0.3964219391345978, + "learning_rate": 9.412270932469374e-05, + "loss": 1.4205, + "step": 1171 + }, + { + "epoch": 0.3557444225223858, + "grad_norm": 0.3874772787094116, + "learning_rate": 9.411764705882353e-05, + "loss": 1.7562, + "step": 1172 + }, + { + "epoch": 0.35604795871907724, + "grad_norm": 0.35493049025535583, + "learning_rate": 9.411258479295333e-05, + "loss": 1.1856, + "step": 1173 + }, + { + "epoch": 0.35635149491576873, + "grad_norm": 0.3838149905204773, + "learning_rate": 9.410752252708312e-05, + "loss": 1.955, + "step": 1174 + }, + { + "epoch": 0.35665503111246016, + "grad_norm": 0.46874240040779114, + "learning_rate": 9.410246026121292e-05, + "loss": 1.7283, + "step": 1175 + }, + { + "epoch": 0.3569585673091516, + "grad_norm": 0.3673109710216522, + "learning_rate": 9.409739799534271e-05, + "loss": 1.8228, + "step": 1176 + }, + { + "epoch": 0.3572621035058431, + "grad_norm": 0.4494078457355499, + "learning_rate": 9.409233572947251e-05, + "loss": 1.6355, + "step": 1177 + }, + { + "epoch": 0.3575656397025345, + "grad_norm": 0.4009113609790802, + "learning_rate": 9.40872734636023e-05, + "loss": 1.7594, + "step": 1178 + }, + { + "epoch": 0.35786917589922596, + "grad_norm": 0.4051864445209503, + "learning_rate": 9.408221119773211e-05, + "loss": 1.7057, + "step": 1179 + }, + { + "epoch": 0.35817271209591744, + "grad_norm": 0.33628928661346436, + "learning_rate": 9.40771489318619e-05, + "loss": 1.8971, + "step": 1180 + }, + { + "epoch": 0.3584762482926089, + "grad_norm": 0.3441104590892792, + "learning_rate": 9.40720866659917e-05, + "loss": 1.8399, + "step": 1181 + }, + { + "epoch": 0.35877978448930037, + "grad_norm": 0.38719773292541504, + "learning_rate": 9.40670244001215e-05, + "loss": 2.0484, + "step": 1182 + }, + { + "epoch": 0.3590833206859918, + "grad_norm": 0.4182259142398834, + "learning_rate": 9.406196213425129e-05, + "loss": 1.525, + "step": 1183 + }, + { + "epoch": 0.35938685688268324, + "grad_norm": 0.42075198888778687, + "learning_rate": 9.405689986838109e-05, + "loss": 2.1262, + "step": 1184 + }, + { + "epoch": 0.3596903930793747, + "grad_norm": 0.3604430556297302, + "learning_rate": 9.405183760251088e-05, + "loss": 1.9715, + "step": 1185 + }, + { + "epoch": 0.35999392927606616, + "grad_norm": 0.46226024627685547, + "learning_rate": 9.404677533664068e-05, + "loss": 1.7088, + "step": 1186 + }, + { + "epoch": 0.36029746547275765, + "grad_norm": 0.3673461377620697, + "learning_rate": 9.404171307077047e-05, + "loss": 1.7057, + "step": 1187 + }, + { + "epoch": 0.3606010016694491, + "grad_norm": 0.40370312333106995, + "learning_rate": 9.403665080490028e-05, + "loss": 1.9058, + "step": 1188 + }, + { + "epoch": 0.3609045378661405, + "grad_norm": 0.39149123430252075, + "learning_rate": 9.403158853903007e-05, + "loss": 2.0148, + "step": 1189 + }, + { + "epoch": 0.361208074062832, + "grad_norm": 0.6711376309394836, + "learning_rate": 9.402652627315988e-05, + "loss": 1.69, + "step": 1190 + }, + { + "epoch": 0.36151161025952344, + "grad_norm": 0.3052380084991455, + "learning_rate": 9.402146400728968e-05, + "loss": 1.4772, + "step": 1191 + }, + { + "epoch": 0.36181514645621493, + "grad_norm": 0.37661212682724, + "learning_rate": 9.401640174141947e-05, + "loss": 1.7378, + "step": 1192 + }, + { + "epoch": 0.36211868265290637, + "grad_norm": 0.39574167132377625, + "learning_rate": 9.401133947554927e-05, + "loss": 1.801, + "step": 1193 + }, + { + "epoch": 0.3624222188495978, + "grad_norm": 0.44611817598342896, + "learning_rate": 9.400627720967906e-05, + "loss": 1.5995, + "step": 1194 + }, + { + "epoch": 0.3627257550462893, + "grad_norm": 0.40026605129241943, + "learning_rate": 9.400121494380886e-05, + "loss": 1.6517, + "step": 1195 + }, + { + "epoch": 0.3630292912429807, + "grad_norm": 0.36110207438468933, + "learning_rate": 9.399615267793865e-05, + "loss": 1.9764, + "step": 1196 + }, + { + "epoch": 0.36333282743967216, + "grad_norm": 0.38339897990226746, + "learning_rate": 9.399109041206845e-05, + "loss": 2.085, + "step": 1197 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 0.36159849166870117, + "learning_rate": 9.398602814619824e-05, + "loss": 1.7839, + "step": 1198 + }, + { + "epoch": 0.3639398998330551, + "grad_norm": 0.3263375461101532, + "learning_rate": 9.398096588032805e-05, + "loss": 1.8143, + "step": 1199 + }, + { + "epoch": 0.36424343602974657, + "grad_norm": 0.3886968195438385, + "learning_rate": 9.397590361445784e-05, + "loss": 2.0904, + "step": 1200 + }, + { + "epoch": 0.364546972226438, + "grad_norm": 0.41123297810554504, + "learning_rate": 9.397084134858764e-05, + "loss": 1.8622, + "step": 1201 + }, + { + "epoch": 0.36485050842312944, + "grad_norm": 0.580788791179657, + "learning_rate": 9.396577908271743e-05, + "loss": 1.9017, + "step": 1202 + }, + { + "epoch": 0.36515404461982093, + "grad_norm": 0.3737773895263672, + "learning_rate": 9.396071681684723e-05, + "loss": 1.5775, + "step": 1203 + }, + { + "epoch": 0.36545758081651236, + "grad_norm": 0.38713717460632324, + "learning_rate": 9.395565455097702e-05, + "loss": 1.967, + "step": 1204 + }, + { + "epoch": 0.36576111701320385, + "grad_norm": 0.7311956882476807, + "learning_rate": 9.395059228510682e-05, + "loss": 1.7088, + "step": 1205 + }, + { + "epoch": 0.3660646532098953, + "grad_norm": 1.4061527252197266, + "learning_rate": 9.394553001923661e-05, + "loss": 1.9163, + "step": 1206 + }, + { + "epoch": 0.3663681894065867, + "grad_norm": 0.3753696382045746, + "learning_rate": 9.39404677533664e-05, + "loss": 2.1954, + "step": 1207 + }, + { + "epoch": 0.3666717256032782, + "grad_norm": 0.38732466101646423, + "learning_rate": 9.393540548749622e-05, + "loss": 1.9101, + "step": 1208 + }, + { + "epoch": 0.36697526179996964, + "grad_norm": 0.41291502118110657, + "learning_rate": 9.393034322162601e-05, + "loss": 1.9483, + "step": 1209 + }, + { + "epoch": 0.3672787979966611, + "grad_norm": 0.44216427206993103, + "learning_rate": 9.39252809557558e-05, + "loss": 2.043, + "step": 1210 + }, + { + "epoch": 0.36758233419335257, + "grad_norm": 0.798313319683075, + "learning_rate": 9.39202186898856e-05, + "loss": 1.9988, + "step": 1211 + }, + { + "epoch": 0.367885870390044, + "grad_norm": 0.5483587384223938, + "learning_rate": 9.39151564240154e-05, + "loss": 2.0979, + "step": 1212 + }, + { + "epoch": 0.3681894065867355, + "grad_norm": 0.44406580924987793, + "learning_rate": 9.391009415814519e-05, + "loss": 1.5858, + "step": 1213 + }, + { + "epoch": 0.3684929427834269, + "grad_norm": 0.3883718252182007, + "learning_rate": 9.390503189227498e-05, + "loss": 1.9014, + "step": 1214 + }, + { + "epoch": 0.36879647898011836, + "grad_norm": 0.7284543514251709, + "learning_rate": 9.389996962640478e-05, + "loss": 1.9709, + "step": 1215 + }, + { + "epoch": 0.36910001517680985, + "grad_norm": 0.38549402356147766, + "learning_rate": 9.389490736053457e-05, + "loss": 1.9513, + "step": 1216 + }, + { + "epoch": 0.3694035513735013, + "grad_norm": 0.39417389035224915, + "learning_rate": 9.388984509466437e-05, + "loss": 2.0409, + "step": 1217 + }, + { + "epoch": 0.36970708757019277, + "grad_norm": 0.40816301107406616, + "learning_rate": 9.388478282879418e-05, + "loss": 2.0136, + "step": 1218 + }, + { + "epoch": 0.3700106237668842, + "grad_norm": 0.5700183510780334, + "learning_rate": 9.387972056292397e-05, + "loss": 1.8478, + "step": 1219 + }, + { + "epoch": 0.37031415996357564, + "grad_norm": 0.35159793496131897, + "learning_rate": 9.387465829705377e-05, + "loss": 1.6004, + "step": 1220 + }, + { + "epoch": 0.37061769616026713, + "grad_norm": 0.41622206568717957, + "learning_rate": 9.386959603118356e-05, + "loss": 1.9104, + "step": 1221 + }, + { + "epoch": 0.37092123235695856, + "grad_norm": 0.4205602705478668, + "learning_rate": 9.386453376531336e-05, + "loss": 2.1058, + "step": 1222 + }, + { + "epoch": 0.37122476855365, + "grad_norm": 0.38390764594078064, + "learning_rate": 9.385947149944315e-05, + "loss": 1.8972, + "step": 1223 + }, + { + "epoch": 0.3715283047503415, + "grad_norm": 0.3790401816368103, + "learning_rate": 9.385440923357295e-05, + "loss": 1.8975, + "step": 1224 + }, + { + "epoch": 0.3718318409470329, + "grad_norm": 0.5210400223731995, + "learning_rate": 9.384934696770274e-05, + "loss": 1.7181, + "step": 1225 + }, + { + "epoch": 0.3721353771437244, + "grad_norm": 0.4098454415798187, + "learning_rate": 9.384428470183253e-05, + "loss": 2.1405, + "step": 1226 + }, + { + "epoch": 0.37243891334041584, + "grad_norm": 0.40917104482650757, + "learning_rate": 9.383922243596234e-05, + "loss": 1.8696, + "step": 1227 + }, + { + "epoch": 0.3727424495371073, + "grad_norm": 0.3712831139564514, + "learning_rate": 9.383416017009214e-05, + "loss": 1.8792, + "step": 1228 + }, + { + "epoch": 0.37304598573379877, + "grad_norm": 0.3110792934894562, + "learning_rate": 9.382909790422193e-05, + "loss": 1.5782, + "step": 1229 + }, + { + "epoch": 0.3733495219304902, + "grad_norm": 0.3657875061035156, + "learning_rate": 9.382403563835173e-05, + "loss": 2.0311, + "step": 1230 + }, + { + "epoch": 0.3736530581271817, + "grad_norm": 0.37432965636253357, + "learning_rate": 9.381897337248152e-05, + "loss": 1.8505, + "step": 1231 + }, + { + "epoch": 0.3739565943238731, + "grad_norm": 0.3771384656429291, + "learning_rate": 9.381391110661132e-05, + "loss": 1.5715, + "step": 1232 + }, + { + "epoch": 0.37426013052056456, + "grad_norm": 1.2401721477508545, + "learning_rate": 9.380884884074111e-05, + "loss": 1.4436, + "step": 1233 + }, + { + "epoch": 0.37456366671725605, + "grad_norm": 0.36102503538131714, + "learning_rate": 9.380378657487092e-05, + "loss": 1.8907, + "step": 1234 + }, + { + "epoch": 0.3748672029139475, + "grad_norm": 0.46541303396224976, + "learning_rate": 9.379872430900072e-05, + "loss": 2.0067, + "step": 1235 + }, + { + "epoch": 0.3751707391106389, + "grad_norm": 0.46490392088890076, + "learning_rate": 9.379366204313051e-05, + "loss": 1.379, + "step": 1236 + }, + { + "epoch": 0.3754742753073304, + "grad_norm": 0.40038684010505676, + "learning_rate": 9.37885997772603e-05, + "loss": 1.9102, + "step": 1237 + }, + { + "epoch": 0.37577781150402184, + "grad_norm": 0.401563435792923, + "learning_rate": 9.378353751139011e-05, + "loss": 2.0087, + "step": 1238 + }, + { + "epoch": 0.37608134770071333, + "grad_norm": 0.38930457830429077, + "learning_rate": 9.377847524551991e-05, + "loss": 1.7222, + "step": 1239 + }, + { + "epoch": 0.37638488389740477, + "grad_norm": 0.4146344065666199, + "learning_rate": 9.37734129796497e-05, + "loss": 1.9965, + "step": 1240 + }, + { + "epoch": 0.3766884200940962, + "grad_norm": 0.7829983830451965, + "learning_rate": 9.37683507137795e-05, + "loss": 1.7133, + "step": 1241 + }, + { + "epoch": 0.3769919562907877, + "grad_norm": 0.3819306492805481, + "learning_rate": 9.376328844790929e-05, + "loss": 1.7532, + "step": 1242 + }, + { + "epoch": 0.3772954924874791, + "grad_norm": 0.35361188650131226, + "learning_rate": 9.375822618203909e-05, + "loss": 1.8794, + "step": 1243 + }, + { + "epoch": 0.3775990286841706, + "grad_norm": 0.37844938039779663, + "learning_rate": 9.375316391616888e-05, + "loss": 2.0991, + "step": 1244 + }, + { + "epoch": 0.37790256488086205, + "grad_norm": 0.49530112743377686, + "learning_rate": 9.374810165029868e-05, + "loss": 1.5838, + "step": 1245 + }, + { + "epoch": 0.3782061010775535, + "grad_norm": 0.36716628074645996, + "learning_rate": 9.374303938442847e-05, + "loss": 1.812, + "step": 1246 + }, + { + "epoch": 0.37850963727424497, + "grad_norm": 0.3772716522216797, + "learning_rate": 9.373797711855828e-05, + "loss": 1.8649, + "step": 1247 + }, + { + "epoch": 0.3788131734709364, + "grad_norm": 0.42215248942375183, + "learning_rate": 9.373291485268807e-05, + "loss": 1.8589, + "step": 1248 + }, + { + "epoch": 0.37911670966762784, + "grad_norm": 0.4086074233055115, + "learning_rate": 9.372785258681787e-05, + "loss": 2.0305, + "step": 1249 + }, + { + "epoch": 0.3794202458643193, + "grad_norm": 0.5096133947372437, + "learning_rate": 9.372279032094766e-05, + "loss": 2.0852, + "step": 1250 + }, + { + "epoch": 0.37972378206101076, + "grad_norm": 0.41633352637290955, + "learning_rate": 9.371772805507746e-05, + "loss": 1.8879, + "step": 1251 + }, + { + "epoch": 0.38002731825770225, + "grad_norm": 0.4787557125091553, + "learning_rate": 9.371266578920725e-05, + "loss": 1.9307, + "step": 1252 + }, + { + "epoch": 0.3803308544543937, + "grad_norm": 0.4313805103302002, + "learning_rate": 9.370760352333705e-05, + "loss": 1.097, + "step": 1253 + }, + { + "epoch": 0.3806343906510851, + "grad_norm": 0.3604517877101898, + "learning_rate": 9.370254125746684e-05, + "loss": 1.9466, + "step": 1254 + }, + { + "epoch": 0.3809379268477766, + "grad_norm": 0.35350343585014343, + "learning_rate": 9.369747899159664e-05, + "loss": 2.1093, + "step": 1255 + }, + { + "epoch": 0.38124146304446804, + "grad_norm": 0.43002399802207947, + "learning_rate": 9.369241672572643e-05, + "loss": 1.9016, + "step": 1256 + }, + { + "epoch": 0.38154499924115953, + "grad_norm": 0.46702131628990173, + "learning_rate": 9.368735445985624e-05, + "loss": 1.9909, + "step": 1257 + }, + { + "epoch": 0.38184853543785097, + "grad_norm": 0.42195767164230347, + "learning_rate": 9.368229219398604e-05, + "loss": 1.9486, + "step": 1258 + }, + { + "epoch": 0.3821520716345424, + "grad_norm": 0.4160800874233246, + "learning_rate": 9.367722992811583e-05, + "loss": 1.2547, + "step": 1259 + }, + { + "epoch": 0.3824556078312339, + "grad_norm": 0.398027628660202, + "learning_rate": 9.367216766224563e-05, + "loss": 1.7109, + "step": 1260 + }, + { + "epoch": 0.3827591440279253, + "grad_norm": 0.35801073908805847, + "learning_rate": 9.366710539637542e-05, + "loss": 1.7718, + "step": 1261 + }, + { + "epoch": 0.38306268022461676, + "grad_norm": 0.3769727647304535, + "learning_rate": 9.366204313050522e-05, + "loss": 1.7201, + "step": 1262 + }, + { + "epoch": 0.38336621642130825, + "grad_norm": 0.4340580105781555, + "learning_rate": 9.365698086463501e-05, + "loss": 1.5747, + "step": 1263 + }, + { + "epoch": 0.3836697526179997, + "grad_norm": 0.48839592933654785, + "learning_rate": 9.36519185987648e-05, + "loss": 2.0381, + "step": 1264 + }, + { + "epoch": 0.38397328881469117, + "grad_norm": 0.3686861991882324, + "learning_rate": 9.36468563328946e-05, + "loss": 1.8126, + "step": 1265 + }, + { + "epoch": 0.3842768250113826, + "grad_norm": 0.45264241099357605, + "learning_rate": 9.364179406702441e-05, + "loss": 1.7206, + "step": 1266 + }, + { + "epoch": 0.38458036120807404, + "grad_norm": 0.45419684052467346, + "learning_rate": 9.36367318011542e-05, + "loss": 2.028, + "step": 1267 + }, + { + "epoch": 0.38488389740476553, + "grad_norm": 0.38863110542297363, + "learning_rate": 9.3631669535284e-05, + "loss": 1.8139, + "step": 1268 + }, + { + "epoch": 0.38518743360145696, + "grad_norm": 0.41982683539390564, + "learning_rate": 9.362660726941379e-05, + "loss": 1.9528, + "step": 1269 + }, + { + "epoch": 0.38549096979814845, + "grad_norm": 0.3579862415790558, + "learning_rate": 9.362154500354359e-05, + "loss": 2.1949, + "step": 1270 + }, + { + "epoch": 0.3857945059948399, + "grad_norm": 0.39172133803367615, + "learning_rate": 9.361648273767338e-05, + "loss": 1.8242, + "step": 1271 + }, + { + "epoch": 0.3860980421915313, + "grad_norm": 0.36367735266685486, + "learning_rate": 9.361142047180318e-05, + "loss": 1.9508, + "step": 1272 + }, + { + "epoch": 0.3864015783882228, + "grad_norm": 0.3536215126514435, + "learning_rate": 9.360635820593297e-05, + "loss": 1.7761, + "step": 1273 + }, + { + "epoch": 0.38670511458491424, + "grad_norm": 0.44467857480049133, + "learning_rate": 9.360129594006277e-05, + "loss": 2.2006, + "step": 1274 + }, + { + "epoch": 0.38700865078160573, + "grad_norm": 0.41492581367492676, + "learning_rate": 9.359623367419257e-05, + "loss": 1.7899, + "step": 1275 + }, + { + "epoch": 0.38731218697829717, + "grad_norm": 0.4128611087799072, + "learning_rate": 9.359117140832237e-05, + "loss": 1.9787, + "step": 1276 + }, + { + "epoch": 0.3876157231749886, + "grad_norm": 0.36134451627731323, + "learning_rate": 9.358610914245216e-05, + "loss": 1.941, + "step": 1277 + }, + { + "epoch": 0.3879192593716801, + "grad_norm": 0.36279958486557007, + "learning_rate": 9.358104687658197e-05, + "loss": 2.1067, + "step": 1278 + }, + { + "epoch": 0.3882227955683715, + "grad_norm": 0.432478666305542, + "learning_rate": 9.357598461071177e-05, + "loss": 2.1264, + "step": 1279 + }, + { + "epoch": 0.38852633176506296, + "grad_norm": 0.3920331299304962, + "learning_rate": 9.357092234484156e-05, + "loss": 1.5522, + "step": 1280 + }, + { + "epoch": 0.38882986796175445, + "grad_norm": 0.3537754416465759, + "learning_rate": 9.356586007897136e-05, + "loss": 1.8354, + "step": 1281 + }, + { + "epoch": 0.3891334041584459, + "grad_norm": 0.40271031856536865, + "learning_rate": 9.356079781310115e-05, + "loss": 1.7567, + "step": 1282 + }, + { + "epoch": 0.3894369403551374, + "grad_norm": 0.47157374024391174, + "learning_rate": 9.355573554723095e-05, + "loss": 1.8542, + "step": 1283 + }, + { + "epoch": 0.3897404765518288, + "grad_norm": 0.3331926167011261, + "learning_rate": 9.355067328136074e-05, + "loss": 1.8651, + "step": 1284 + }, + { + "epoch": 0.39004401274852024, + "grad_norm": 0.884087860584259, + "learning_rate": 9.354561101549054e-05, + "loss": 1.4765, + "step": 1285 + }, + { + "epoch": 0.39034754894521173, + "grad_norm": 0.3618917167186737, + "learning_rate": 9.354054874962034e-05, + "loss": 2.0908, + "step": 1286 + }, + { + "epoch": 0.39065108514190316, + "grad_norm": 0.3494134843349457, + "learning_rate": 9.353548648375014e-05, + "loss": 1.1975, + "step": 1287 + }, + { + "epoch": 0.39095462133859465, + "grad_norm": 0.40450137853622437, + "learning_rate": 9.353042421787993e-05, + "loss": 1.5567, + "step": 1288 + }, + { + "epoch": 0.3912581575352861, + "grad_norm": 0.3893278241157532, + "learning_rate": 9.352536195200973e-05, + "loss": 1.6993, + "step": 1289 + }, + { + "epoch": 0.3915616937319775, + "grad_norm": 0.6020291447639465, + "learning_rate": 9.352029968613952e-05, + "loss": 1.7073, + "step": 1290 + }, + { + "epoch": 0.391865229928669, + "grad_norm": 0.43949219584465027, + "learning_rate": 9.351523742026932e-05, + "loss": 1.8407, + "step": 1291 + }, + { + "epoch": 0.39216876612536045, + "grad_norm": 0.41567811369895935, + "learning_rate": 9.351017515439911e-05, + "loss": 2.0354, + "step": 1292 + }, + { + "epoch": 0.3924723023220519, + "grad_norm": 0.41198036074638367, + "learning_rate": 9.350511288852891e-05, + "loss": 1.964, + "step": 1293 + }, + { + "epoch": 0.39277583851874337, + "grad_norm": 0.3735191524028778, + "learning_rate": 9.35000506226587e-05, + "loss": 1.8359, + "step": 1294 + }, + { + "epoch": 0.3930793747154348, + "grad_norm": 0.4426116347312927, + "learning_rate": 9.34949883567885e-05, + "loss": 1.8876, + "step": 1295 + }, + { + "epoch": 0.3933829109121263, + "grad_norm": 0.3956250548362732, + "learning_rate": 9.34899260909183e-05, + "loss": 1.5177, + "step": 1296 + }, + { + "epoch": 0.3936864471088177, + "grad_norm": 0.3534790575504303, + "learning_rate": 9.34848638250481e-05, + "loss": 2.1419, + "step": 1297 + }, + { + "epoch": 0.39398998330550916, + "grad_norm": 0.4134576916694641, + "learning_rate": 9.34798015591779e-05, + "loss": 1.5873, + "step": 1298 + }, + { + "epoch": 0.39429351950220065, + "grad_norm": 0.4386560916900635, + "learning_rate": 9.347473929330769e-05, + "loss": 1.4547, + "step": 1299 + }, + { + "epoch": 0.3945970556988921, + "grad_norm": 0.41839587688446045, + "learning_rate": 9.346967702743749e-05, + "loss": 1.5251, + "step": 1300 + }, + { + "epoch": 0.3949005918955836, + "grad_norm": 0.333609938621521, + "learning_rate": 9.346461476156728e-05, + "loss": 1.5575, + "step": 1301 + }, + { + "epoch": 0.395204128092275, + "grad_norm": 0.4706360101699829, + "learning_rate": 9.345955249569707e-05, + "loss": 1.8686, + "step": 1302 + }, + { + "epoch": 0.39550766428896644, + "grad_norm": 0.3555939495563507, + "learning_rate": 9.345449022982687e-05, + "loss": 1.5205, + "step": 1303 + }, + { + "epoch": 0.39581120048565793, + "grad_norm": 0.47611120343208313, + "learning_rate": 9.344942796395666e-05, + "loss": 1.5114, + "step": 1304 + }, + { + "epoch": 0.39611473668234937, + "grad_norm": 0.570785641670227, + "learning_rate": 9.344436569808647e-05, + "loss": 1.9987, + "step": 1305 + }, + { + "epoch": 0.3964182728790408, + "grad_norm": 0.3685778081417084, + "learning_rate": 9.343930343221627e-05, + "loss": 1.9471, + "step": 1306 + }, + { + "epoch": 0.3967218090757323, + "grad_norm": 0.4187014698982239, + "learning_rate": 9.343424116634606e-05, + "loss": 2.1728, + "step": 1307 + }, + { + "epoch": 0.3970253452724237, + "grad_norm": 0.35904020071029663, + "learning_rate": 9.342917890047586e-05, + "loss": 1.9576, + "step": 1308 + }, + { + "epoch": 0.3973288814691152, + "grad_norm": 0.48214206099510193, + "learning_rate": 9.342411663460565e-05, + "loss": 1.7529, + "step": 1309 + }, + { + "epoch": 0.39763241766580665, + "grad_norm": 0.3852714002132416, + "learning_rate": 9.341905436873545e-05, + "loss": 2.256, + "step": 1310 + }, + { + "epoch": 0.3979359538624981, + "grad_norm": 0.44712984561920166, + "learning_rate": 9.341399210286524e-05, + "loss": 1.8981, + "step": 1311 + }, + { + "epoch": 0.39823949005918957, + "grad_norm": 0.42379963397979736, + "learning_rate": 9.340892983699504e-05, + "loss": 2.0528, + "step": 1312 + }, + { + "epoch": 0.398543026255881, + "grad_norm": 0.3936759829521179, + "learning_rate": 9.340386757112483e-05, + "loss": 1.592, + "step": 1313 + }, + { + "epoch": 0.3988465624525725, + "grad_norm": 0.4035021662712097, + "learning_rate": 9.339880530525464e-05, + "loss": 2.0751, + "step": 1314 + }, + { + "epoch": 0.39915009864926393, + "grad_norm": 0.3658972382545471, + "learning_rate": 9.339374303938443e-05, + "loss": 1.7568, + "step": 1315 + }, + { + "epoch": 0.39945363484595536, + "grad_norm": 0.4271409511566162, + "learning_rate": 9.338868077351423e-05, + "loss": 2.1243, + "step": 1316 + }, + { + "epoch": 0.39975717104264685, + "grad_norm": 0.3799911439418793, + "learning_rate": 9.338361850764402e-05, + "loss": 1.9763, + "step": 1317 + }, + { + "epoch": 0.4000607072393383, + "grad_norm": 0.3878629803657532, + "learning_rate": 9.337855624177382e-05, + "loss": 1.6328, + "step": 1318 + }, + { + "epoch": 0.4003642434360297, + "grad_norm": 0.3611898124217987, + "learning_rate": 9.337349397590361e-05, + "loss": 1.8017, + "step": 1319 + }, + { + "epoch": 0.4006677796327212, + "grad_norm": 0.4010205864906311, + "learning_rate": 9.336843171003341e-05, + "loss": 2.1213, + "step": 1320 + }, + { + "epoch": 0.40097131582941264, + "grad_norm": 0.4076247811317444, + "learning_rate": 9.33633694441632e-05, + "loss": 1.8366, + "step": 1321 + }, + { + "epoch": 0.40127485202610413, + "grad_norm": 0.4172746241092682, + "learning_rate": 9.3358307178293e-05, + "loss": 1.7759, + "step": 1322 + }, + { + "epoch": 0.40157838822279557, + "grad_norm": 0.6179870367050171, + "learning_rate": 9.33532449124228e-05, + "loss": 1.7803, + "step": 1323 + }, + { + "epoch": 0.401881924419487, + "grad_norm": 0.38707882165908813, + "learning_rate": 9.33481826465526e-05, + "loss": 1.2795, + "step": 1324 + }, + { + "epoch": 0.4021854606161785, + "grad_norm": 0.35764575004577637, + "learning_rate": 9.334312038068241e-05, + "loss": 1.3463, + "step": 1325 + }, + { + "epoch": 0.4024889968128699, + "grad_norm": 0.40050292015075684, + "learning_rate": 9.33380581148122e-05, + "loss": 1.8487, + "step": 1326 + }, + { + "epoch": 0.4027925330095614, + "grad_norm": 0.5421705842018127, + "learning_rate": 9.3332995848942e-05, + "loss": 2.0407, + "step": 1327 + }, + { + "epoch": 0.40309606920625285, + "grad_norm": 0.5423186421394348, + "learning_rate": 9.33279335830718e-05, + "loss": 1.6743, + "step": 1328 + }, + { + "epoch": 0.4033996054029443, + "grad_norm": 0.41429242491722107, + "learning_rate": 9.332287131720159e-05, + "loss": 2.1614, + "step": 1329 + }, + { + "epoch": 0.4037031415996358, + "grad_norm": 0.41197100281715393, + "learning_rate": 9.331780905133138e-05, + "loss": 1.9414, + "step": 1330 + }, + { + "epoch": 0.4040066777963272, + "grad_norm": 0.3833538293838501, + "learning_rate": 9.331274678546118e-05, + "loss": 1.4602, + "step": 1331 + }, + { + "epoch": 0.40431021399301864, + "grad_norm": 0.4118226170539856, + "learning_rate": 9.330768451959097e-05, + "loss": 2.0595, + "step": 1332 + }, + { + "epoch": 0.40461375018971013, + "grad_norm": 0.3417702317237854, + "learning_rate": 9.330262225372077e-05, + "loss": 1.5938, + "step": 1333 + }, + { + "epoch": 0.40491728638640156, + "grad_norm": 0.3822105824947357, + "learning_rate": 9.329755998785056e-05, + "loss": 1.9013, + "step": 1334 + }, + { + "epoch": 0.40522082258309305, + "grad_norm": 0.7788810133934021, + "learning_rate": 9.329249772198037e-05, + "loss": 1.6752, + "step": 1335 + }, + { + "epoch": 0.4055243587797845, + "grad_norm": 0.4163956940174103, + "learning_rate": 9.328743545611017e-05, + "loss": 1.7016, + "step": 1336 + }, + { + "epoch": 0.4058278949764759, + "grad_norm": 0.42450758814811707, + "learning_rate": 9.328237319023996e-05, + "loss": 1.831, + "step": 1337 + }, + { + "epoch": 0.4061314311731674, + "grad_norm": 0.4169425666332245, + "learning_rate": 9.327731092436976e-05, + "loss": 1.7361, + "step": 1338 + }, + { + "epoch": 0.40643496736985885, + "grad_norm": 0.3413407802581787, + "learning_rate": 9.327224865849955e-05, + "loss": 2.0173, + "step": 1339 + }, + { + "epoch": 0.40673850356655034, + "grad_norm": 0.3989046812057495, + "learning_rate": 9.326718639262934e-05, + "loss": 1.9196, + "step": 1340 + }, + { + "epoch": 0.40704203976324177, + "grad_norm": 0.47707435488700867, + "learning_rate": 9.326212412675914e-05, + "loss": 1.9119, + "step": 1341 + }, + { + "epoch": 0.4073455759599332, + "grad_norm": 0.3998529314994812, + "learning_rate": 9.325706186088893e-05, + "loss": 1.8923, + "step": 1342 + }, + { + "epoch": 0.4076491121566247, + "grad_norm": 0.3560973107814789, + "learning_rate": 9.325199959501873e-05, + "loss": 1.8033, + "step": 1343 + }, + { + "epoch": 0.4079526483533161, + "grad_norm": 0.42655158042907715, + "learning_rate": 9.324693732914854e-05, + "loss": 1.5517, + "step": 1344 + }, + { + "epoch": 0.40825618455000756, + "grad_norm": 0.4044337272644043, + "learning_rate": 9.324187506327833e-05, + "loss": 1.8332, + "step": 1345 + }, + { + "epoch": 0.40855972074669905, + "grad_norm": 0.382467120885849, + "learning_rate": 9.323681279740813e-05, + "loss": 2.0407, + "step": 1346 + }, + { + "epoch": 0.4088632569433905, + "grad_norm": 0.46734219789505005, + "learning_rate": 9.323175053153792e-05, + "loss": 1.5662, + "step": 1347 + }, + { + "epoch": 0.409166793140082, + "grad_norm": 0.45105868577957153, + "learning_rate": 9.322668826566772e-05, + "loss": 2.0506, + "step": 1348 + }, + { + "epoch": 0.4094703293367734, + "grad_norm": 0.3531922399997711, + "learning_rate": 9.322162599979751e-05, + "loss": 1.7494, + "step": 1349 + }, + { + "epoch": 0.40977386553346484, + "grad_norm": 0.3707609474658966, + "learning_rate": 9.32165637339273e-05, + "loss": 2.2804, + "step": 1350 + }, + { + "epoch": 0.41007740173015633, + "grad_norm": 0.38254693150520325, + "learning_rate": 9.32115014680571e-05, + "loss": 1.9183, + "step": 1351 + }, + { + "epoch": 0.41038093792684777, + "grad_norm": 0.41418614983558655, + "learning_rate": 9.32064392021869e-05, + "loss": 1.8523, + "step": 1352 + }, + { + "epoch": 0.41068447412353926, + "grad_norm": 0.42098134756088257, + "learning_rate": 9.32013769363167e-05, + "loss": 1.712, + "step": 1353 + }, + { + "epoch": 0.4109880103202307, + "grad_norm": 0.3387204706668854, + "learning_rate": 9.31963146704465e-05, + "loss": 1.7652, + "step": 1354 + }, + { + "epoch": 0.4112915465169221, + "grad_norm": 0.4330706000328064, + "learning_rate": 9.31912524045763e-05, + "loss": 1.478, + "step": 1355 + }, + { + "epoch": 0.4115950827136136, + "grad_norm": 0.36673831939697266, + "learning_rate": 9.318619013870609e-05, + "loss": 1.7798, + "step": 1356 + }, + { + "epoch": 0.41189861891030505, + "grad_norm": 0.40374481678009033, + "learning_rate": 9.318112787283588e-05, + "loss": 1.6161, + "step": 1357 + }, + { + "epoch": 0.4122021551069965, + "grad_norm": 0.38840124011039734, + "learning_rate": 9.317606560696568e-05, + "loss": 1.3431, + "step": 1358 + }, + { + "epoch": 0.41250569130368797, + "grad_norm": 0.4768214225769043, + "learning_rate": 9.317100334109547e-05, + "loss": 1.6115, + "step": 1359 + }, + { + "epoch": 0.4128092275003794, + "grad_norm": 0.43069908022880554, + "learning_rate": 9.316594107522527e-05, + "loss": 2.0131, + "step": 1360 + }, + { + "epoch": 0.4131127636970709, + "grad_norm": 0.36959967017173767, + "learning_rate": 9.316087880935506e-05, + "loss": 1.982, + "step": 1361 + }, + { + "epoch": 0.41341629989376233, + "grad_norm": 0.3068915009498596, + "learning_rate": 9.315581654348486e-05, + "loss": 1.8105, + "step": 1362 + }, + { + "epoch": 0.41371983609045376, + "grad_norm": 0.33738118410110474, + "learning_rate": 9.315075427761467e-05, + "loss": 1.6039, + "step": 1363 + }, + { + "epoch": 0.41402337228714525, + "grad_norm": 0.38889479637145996, + "learning_rate": 9.314569201174446e-05, + "loss": 1.7897, + "step": 1364 + }, + { + "epoch": 0.4143269084838367, + "grad_norm": 0.35099512338638306, + "learning_rate": 9.314062974587426e-05, + "loss": 1.5897, + "step": 1365 + }, + { + "epoch": 0.4146304446805282, + "grad_norm": 0.3819596767425537, + "learning_rate": 9.313556748000405e-05, + "loss": 1.4781, + "step": 1366 + }, + { + "epoch": 0.4149339808772196, + "grad_norm": 0.392493337392807, + "learning_rate": 9.313050521413386e-05, + "loss": 1.8577, + "step": 1367 + }, + { + "epoch": 0.41523751707391104, + "grad_norm": 0.34424975514411926, + "learning_rate": 9.312544294826365e-05, + "loss": 1.4452, + "step": 1368 + }, + { + "epoch": 0.41554105327060253, + "grad_norm": 0.44334256649017334, + "learning_rate": 9.312038068239345e-05, + "loss": 1.5221, + "step": 1369 + }, + { + "epoch": 0.41584458946729397, + "grad_norm": 0.4194605350494385, + "learning_rate": 9.311531841652324e-05, + "loss": 2.3915, + "step": 1370 + }, + { + "epoch": 0.41614812566398546, + "grad_norm": 0.33700132369995117, + "learning_rate": 9.311025615065304e-05, + "loss": 1.9193, + "step": 1371 + }, + { + "epoch": 0.4164516618606769, + "grad_norm": 0.4527650773525238, + "learning_rate": 9.310519388478283e-05, + "loss": 1.766, + "step": 1372 + }, + { + "epoch": 0.4167551980573683, + "grad_norm": 0.3435012996196747, + "learning_rate": 9.310013161891263e-05, + "loss": 1.8662, + "step": 1373 + }, + { + "epoch": 0.4170587342540598, + "grad_norm": 0.3468983471393585, + "learning_rate": 9.309506935304244e-05, + "loss": 1.5261, + "step": 1374 + }, + { + "epoch": 0.41736227045075125, + "grad_norm": 0.38368481397628784, + "learning_rate": 9.309000708717223e-05, + "loss": 1.6389, + "step": 1375 + }, + { + "epoch": 0.4176658066474427, + "grad_norm": 0.38153746724128723, + "learning_rate": 9.308494482130203e-05, + "loss": 1.7431, + "step": 1376 + }, + { + "epoch": 0.4179693428441342, + "grad_norm": 0.4192492961883545, + "learning_rate": 9.307988255543182e-05, + "loss": 1.8582, + "step": 1377 + }, + { + "epoch": 0.4182728790408256, + "grad_norm": 0.42689767479896545, + "learning_rate": 9.307482028956161e-05, + "loss": 1.9049, + "step": 1378 + }, + { + "epoch": 0.4185764152375171, + "grad_norm": 0.38545602560043335, + "learning_rate": 9.306975802369141e-05, + "loss": 1.2598, + "step": 1379 + }, + { + "epoch": 0.41887995143420853, + "grad_norm": 0.4117288887500763, + "learning_rate": 9.30646957578212e-05, + "loss": 1.9808, + "step": 1380 + }, + { + "epoch": 0.41918348763089996, + "grad_norm": 0.38102084398269653, + "learning_rate": 9.3059633491951e-05, + "loss": 1.9734, + "step": 1381 + }, + { + "epoch": 0.41948702382759145, + "grad_norm": 0.3788716495037079, + "learning_rate": 9.30545712260808e-05, + "loss": 1.9655, + "step": 1382 + }, + { + "epoch": 0.4197905600242829, + "grad_norm": 1.5338399410247803, + "learning_rate": 9.30495089602106e-05, + "loss": 2.1111, + "step": 1383 + }, + { + "epoch": 0.4200940962209744, + "grad_norm": 0.40994685888290405, + "learning_rate": 9.30444466943404e-05, + "loss": 1.9063, + "step": 1384 + }, + { + "epoch": 0.4203976324176658, + "grad_norm": 0.3389085829257965, + "learning_rate": 9.303938442847019e-05, + "loss": 1.8592, + "step": 1385 + }, + { + "epoch": 0.42070116861435725, + "grad_norm": 1.0117053985595703, + "learning_rate": 9.303432216259999e-05, + "loss": 1.4198, + "step": 1386 + }, + { + "epoch": 0.42100470481104874, + "grad_norm": 0.37429583072662354, + "learning_rate": 9.302925989672978e-05, + "loss": 1.6439, + "step": 1387 + }, + { + "epoch": 0.42130824100774017, + "grad_norm": 0.397991806268692, + "learning_rate": 9.302419763085958e-05, + "loss": 1.9744, + "step": 1388 + }, + { + "epoch": 0.4216117772044316, + "grad_norm": 0.39546629786491394, + "learning_rate": 9.301913536498937e-05, + "loss": 1.993, + "step": 1389 + }, + { + "epoch": 0.4219153134011231, + "grad_norm": 0.3465210497379303, + "learning_rate": 9.301407309911917e-05, + "loss": 1.8254, + "step": 1390 + }, + { + "epoch": 0.4222188495978145, + "grad_norm": 0.36281952261924744, + "learning_rate": 9.300901083324896e-05, + "loss": 1.9205, + "step": 1391 + }, + { + "epoch": 0.422522385794506, + "grad_norm": 0.37978988885879517, + "learning_rate": 9.300394856737877e-05, + "loss": 1.8021, + "step": 1392 + }, + { + "epoch": 0.42282592199119745, + "grad_norm": 0.3463260531425476, + "learning_rate": 9.299888630150856e-05, + "loss": 2.1022, + "step": 1393 + }, + { + "epoch": 0.4231294581878889, + "grad_norm": 0.3449305593967438, + "learning_rate": 9.299382403563836e-05, + "loss": 1.808, + "step": 1394 + }, + { + "epoch": 0.4234329943845804, + "grad_norm": 0.3900066018104553, + "learning_rate": 9.298876176976815e-05, + "loss": 1.8926, + "step": 1395 + }, + { + "epoch": 0.4237365305812718, + "grad_norm": 0.3958972692489624, + "learning_rate": 9.298369950389795e-05, + "loss": 1.7716, + "step": 1396 + }, + { + "epoch": 0.4240400667779633, + "grad_norm": 0.41263818740844727, + "learning_rate": 9.297863723802774e-05, + "loss": 1.9745, + "step": 1397 + }, + { + "epoch": 0.42434360297465473, + "grad_norm": 0.44245028495788574, + "learning_rate": 9.297357497215754e-05, + "loss": 1.6498, + "step": 1398 + }, + { + "epoch": 0.42464713917134617, + "grad_norm": 0.36662882566452026, + "learning_rate": 9.296851270628733e-05, + "loss": 1.9321, + "step": 1399 + }, + { + "epoch": 0.42495067536803766, + "grad_norm": 0.38561105728149414, + "learning_rate": 9.296345044041713e-05, + "loss": 1.8661, + "step": 1400 + }, + { + "epoch": 0.4252542115647291, + "grad_norm": 0.3688740134239197, + "learning_rate": 9.295838817454692e-05, + "loss": 2.1375, + "step": 1401 + }, + { + "epoch": 0.4255577477614205, + "grad_norm": 0.3883054256439209, + "learning_rate": 9.295332590867673e-05, + "loss": 1.486, + "step": 1402 + }, + { + "epoch": 0.425861283958112, + "grad_norm": 0.4107448160648346, + "learning_rate": 9.294826364280653e-05, + "loss": 1.9075, + "step": 1403 + }, + { + "epoch": 0.42616482015480345, + "grad_norm": 0.4174923896789551, + "learning_rate": 9.294320137693632e-05, + "loss": 2.0668, + "step": 1404 + }, + { + "epoch": 0.42646835635149494, + "grad_norm": 0.4573984444141388, + "learning_rate": 9.293813911106611e-05, + "loss": 1.8517, + "step": 1405 + }, + { + "epoch": 0.42677189254818637, + "grad_norm": 0.3820217251777649, + "learning_rate": 9.293307684519591e-05, + "loss": 1.7841, + "step": 1406 + }, + { + "epoch": 0.4270754287448778, + "grad_norm": 0.34213465452194214, + "learning_rate": 9.29280145793257e-05, + "loss": 1.9139, + "step": 1407 + }, + { + "epoch": 0.4273789649415693, + "grad_norm": 0.3995790481567383, + "learning_rate": 9.29229523134555e-05, + "loss": 1.6883, + "step": 1408 + }, + { + "epoch": 0.42768250113826073, + "grad_norm": 0.4142625331878662, + "learning_rate": 9.29178900475853e-05, + "loss": 2.0771, + "step": 1409 + }, + { + "epoch": 0.4279860373349522, + "grad_norm": 0.3818739354610443, + "learning_rate": 9.291282778171509e-05, + "loss": 1.6682, + "step": 1410 + }, + { + "epoch": 0.42828957353164365, + "grad_norm": 0.36996081471443176, + "learning_rate": 9.29077655158449e-05, + "loss": 2.1084, + "step": 1411 + }, + { + "epoch": 0.4285931097283351, + "grad_norm": 0.4592280983924866, + "learning_rate": 9.290270324997469e-05, + "loss": 1.4502, + "step": 1412 + }, + { + "epoch": 0.4288966459250266, + "grad_norm": 0.4243657886981964, + "learning_rate": 9.28976409841045e-05, + "loss": 1.8459, + "step": 1413 + }, + { + "epoch": 0.429200182121718, + "grad_norm": 0.4068589508533478, + "learning_rate": 9.28925787182343e-05, + "loss": 1.8392, + "step": 1414 + }, + { + "epoch": 0.42950371831840944, + "grad_norm": 0.3421384394168854, + "learning_rate": 9.288751645236409e-05, + "loss": 1.9204, + "step": 1415 + }, + { + "epoch": 0.42980725451510093, + "grad_norm": 0.36633387207984924, + "learning_rate": 9.288245418649388e-05, + "loss": 2.1934, + "step": 1416 + }, + { + "epoch": 0.43011079071179237, + "grad_norm": 0.6671120524406433, + "learning_rate": 9.287739192062368e-05, + "loss": 1.7614, + "step": 1417 + }, + { + "epoch": 0.43041432690848386, + "grad_norm": 0.3610883057117462, + "learning_rate": 9.287232965475347e-05, + "loss": 1.9075, + "step": 1418 + }, + { + "epoch": 0.4307178631051753, + "grad_norm": 0.42165407538414, + "learning_rate": 9.286726738888327e-05, + "loss": 1.4474, + "step": 1419 + }, + { + "epoch": 0.4310213993018667, + "grad_norm": 0.38051116466522217, + "learning_rate": 9.286220512301306e-05, + "loss": 1.7629, + "step": 1420 + }, + { + "epoch": 0.4313249354985582, + "grad_norm": 0.38990986347198486, + "learning_rate": 9.285714285714286e-05, + "loss": 1.7111, + "step": 1421 + }, + { + "epoch": 0.43162847169524965, + "grad_norm": 0.3510812222957611, + "learning_rate": 9.285208059127267e-05, + "loss": 1.7695, + "step": 1422 + }, + { + "epoch": 0.43193200789194114, + "grad_norm": 0.34757426381111145, + "learning_rate": 9.284701832540246e-05, + "loss": 2.173, + "step": 1423 + }, + { + "epoch": 0.4322355440886326, + "grad_norm": 0.3806573152542114, + "learning_rate": 9.284195605953226e-05, + "loss": 1.8029, + "step": 1424 + }, + { + "epoch": 0.432539080285324, + "grad_norm": 0.3845151662826538, + "learning_rate": 9.283689379366205e-05, + "loss": 1.902, + "step": 1425 + }, + { + "epoch": 0.4328426164820155, + "grad_norm": 0.40006932616233826, + "learning_rate": 9.283183152779185e-05, + "loss": 1.6436, + "step": 1426 + }, + { + "epoch": 0.43314615267870693, + "grad_norm": 0.5392235517501831, + "learning_rate": 9.282676926192164e-05, + "loss": 1.921, + "step": 1427 + }, + { + "epoch": 0.43344968887539836, + "grad_norm": 0.4523599147796631, + "learning_rate": 9.282170699605144e-05, + "loss": 1.7473, + "step": 1428 + }, + { + "epoch": 0.43375322507208985, + "grad_norm": 0.3809603154659271, + "learning_rate": 9.281664473018123e-05, + "loss": 1.5461, + "step": 1429 + }, + { + "epoch": 0.4340567612687813, + "grad_norm": 0.4202471375465393, + "learning_rate": 9.281158246431103e-05, + "loss": 1.995, + "step": 1430 + }, + { + "epoch": 0.4343602974654728, + "grad_norm": 0.42767444252967834, + "learning_rate": 9.280652019844083e-05, + "loss": 1.9536, + "step": 1431 + }, + { + "epoch": 0.4346638336621642, + "grad_norm": 0.4050025939941406, + "learning_rate": 9.280145793257063e-05, + "loss": 1.6169, + "step": 1432 + }, + { + "epoch": 0.43496736985885565, + "grad_norm": 0.4572995901107788, + "learning_rate": 9.279639566670042e-05, + "loss": 1.5711, + "step": 1433 + }, + { + "epoch": 0.43527090605554714, + "grad_norm": 0.4102776050567627, + "learning_rate": 9.279133340083022e-05, + "loss": 1.9844, + "step": 1434 + }, + { + "epoch": 0.43557444225223857, + "grad_norm": 0.4309599995613098, + "learning_rate": 9.278627113496001e-05, + "loss": 1.8742, + "step": 1435 + }, + { + "epoch": 0.43587797844893006, + "grad_norm": 0.34951043128967285, + "learning_rate": 9.278120886908981e-05, + "loss": 1.9262, + "step": 1436 + }, + { + "epoch": 0.4361815146456215, + "grad_norm": 0.47722557187080383, + "learning_rate": 9.27761466032196e-05, + "loss": 1.5605, + "step": 1437 + }, + { + "epoch": 0.4364850508423129, + "grad_norm": 0.37698620557785034, + "learning_rate": 9.27710843373494e-05, + "loss": 2.3081, + "step": 1438 + }, + { + "epoch": 0.4367885870390044, + "grad_norm": 0.40668490529060364, + "learning_rate": 9.276602207147919e-05, + "loss": 1.4524, + "step": 1439 + }, + { + "epoch": 0.43709212323569585, + "grad_norm": 0.4384947121143341, + "learning_rate": 9.276095980560899e-05, + "loss": 1.7878, + "step": 1440 + }, + { + "epoch": 0.4373956594323873, + "grad_norm": 3.140451192855835, + "learning_rate": 9.27558975397388e-05, + "loss": 1.7084, + "step": 1441 + }, + { + "epoch": 0.4376991956290788, + "grad_norm": 0.43369218707084656, + "learning_rate": 9.275083527386859e-05, + "loss": 2.0439, + "step": 1442 + }, + { + "epoch": 0.4380027318257702, + "grad_norm": 0.4725881516933441, + "learning_rate": 9.274577300799838e-05, + "loss": 2.0507, + "step": 1443 + }, + { + "epoch": 0.4383062680224617, + "grad_norm": 0.4496382474899292, + "learning_rate": 9.274071074212818e-05, + "loss": 2.1349, + "step": 1444 + }, + { + "epoch": 0.43860980421915313, + "grad_norm": 1.6437734365463257, + "learning_rate": 9.273564847625797e-05, + "loss": 1.9441, + "step": 1445 + }, + { + "epoch": 0.43891334041584457, + "grad_norm": 0.4106156527996063, + "learning_rate": 9.273058621038777e-05, + "loss": 1.4615, + "step": 1446 + }, + { + "epoch": 0.43921687661253606, + "grad_norm": 0.4387066960334778, + "learning_rate": 9.272552394451756e-05, + "loss": 1.8679, + "step": 1447 + }, + { + "epoch": 0.4395204128092275, + "grad_norm": 0.44515758752822876, + "learning_rate": 9.272046167864736e-05, + "loss": 1.9675, + "step": 1448 + }, + { + "epoch": 0.439823949005919, + "grad_norm": 0.43665841221809387, + "learning_rate": 9.271539941277715e-05, + "loss": 2.2782, + "step": 1449 + }, + { + "epoch": 0.4401274852026104, + "grad_norm": 0.3593182861804962, + "learning_rate": 9.271033714690696e-05, + "loss": 1.6537, + "step": 1450 + }, + { + "epoch": 0.44043102139930185, + "grad_norm": 0.38529497385025024, + "learning_rate": 9.270527488103676e-05, + "loss": 1.9399, + "step": 1451 + }, + { + "epoch": 0.44073455759599334, + "grad_norm": 0.42474156618118286, + "learning_rate": 9.270021261516655e-05, + "loss": 1.849, + "step": 1452 + }, + { + "epoch": 0.44103809379268477, + "grad_norm": 0.4505622684955597, + "learning_rate": 9.269515034929635e-05, + "loss": 1.9889, + "step": 1453 + }, + { + "epoch": 0.44134162998937626, + "grad_norm": 1.8219722509384155, + "learning_rate": 9.269008808342614e-05, + "loss": 2.1467, + "step": 1454 + }, + { + "epoch": 0.4416451661860677, + "grad_norm": 0.6941187381744385, + "learning_rate": 9.268502581755594e-05, + "loss": 2.1441, + "step": 1455 + }, + { + "epoch": 0.44194870238275913, + "grad_norm": 0.6262606978416443, + "learning_rate": 9.267996355168574e-05, + "loss": 1.9937, + "step": 1456 + }, + { + "epoch": 0.4422522385794506, + "grad_norm": 0.3790215253829956, + "learning_rate": 9.267490128581554e-05, + "loss": 1.7468, + "step": 1457 + }, + { + "epoch": 0.44255577477614205, + "grad_norm": 0.42074668407440186, + "learning_rate": 9.266983901994533e-05, + "loss": 2.1245, + "step": 1458 + }, + { + "epoch": 0.4428593109728335, + "grad_norm": 0.464870810508728, + "learning_rate": 9.266477675407513e-05, + "loss": 1.8672, + "step": 1459 + }, + { + "epoch": 0.443162847169525, + "grad_norm": 0.4551111161708832, + "learning_rate": 9.265971448820492e-05, + "loss": 2.054, + "step": 1460 + }, + { + "epoch": 0.4434663833662164, + "grad_norm": 0.3874572813510895, + "learning_rate": 9.265465222233473e-05, + "loss": 1.8281, + "step": 1461 + }, + { + "epoch": 0.4437699195629079, + "grad_norm": 0.44287312030792236, + "learning_rate": 9.264958995646453e-05, + "loss": 1.6435, + "step": 1462 + }, + { + "epoch": 0.44407345575959933, + "grad_norm": 0.41155338287353516, + "learning_rate": 9.264452769059432e-05, + "loss": 1.9611, + "step": 1463 + }, + { + "epoch": 0.44437699195629077, + "grad_norm": 0.480648398399353, + "learning_rate": 9.263946542472412e-05, + "loss": 1.7771, + "step": 1464 + }, + { + "epoch": 0.44468052815298226, + "grad_norm": 0.4704960286617279, + "learning_rate": 9.263440315885391e-05, + "loss": 0.6294, + "step": 1465 + }, + { + "epoch": 0.4449840643496737, + "grad_norm": 0.4150315821170807, + "learning_rate": 9.26293408929837e-05, + "loss": 1.7698, + "step": 1466 + }, + { + "epoch": 0.4452876005463652, + "grad_norm": 0.5981085300445557, + "learning_rate": 9.26242786271135e-05, + "loss": 1.7192, + "step": 1467 + }, + { + "epoch": 0.4455911367430566, + "grad_norm": 0.43365392088890076, + "learning_rate": 9.26192163612433e-05, + "loss": 1.8843, + "step": 1468 + }, + { + "epoch": 0.44589467293974805, + "grad_norm": 0.7336254715919495, + "learning_rate": 9.261415409537309e-05, + "loss": 2.0101, + "step": 1469 + }, + { + "epoch": 0.44619820913643954, + "grad_norm": 0.4002796411514282, + "learning_rate": 9.26090918295029e-05, + "loss": 1.9817, + "step": 1470 + }, + { + "epoch": 0.446501745333131, + "grad_norm": 0.4379813075065613, + "learning_rate": 9.26040295636327e-05, + "loss": 2.1091, + "step": 1471 + }, + { + "epoch": 0.4468052815298224, + "grad_norm": 0.4577115774154663, + "learning_rate": 9.259896729776249e-05, + "loss": 1.6132, + "step": 1472 + }, + { + "epoch": 0.4471088177265139, + "grad_norm": 0.40199458599090576, + "learning_rate": 9.259390503189228e-05, + "loss": 1.9815, + "step": 1473 + }, + { + "epoch": 0.44741235392320533, + "grad_norm": 0.4442947506904602, + "learning_rate": 9.258884276602208e-05, + "loss": 1.8425, + "step": 1474 + }, + { + "epoch": 0.4477158901198968, + "grad_norm": 0.3720739781856537, + "learning_rate": 9.258378050015187e-05, + "loss": 2.1161, + "step": 1475 + }, + { + "epoch": 0.44801942631658825, + "grad_norm": 0.39746803045272827, + "learning_rate": 9.257871823428167e-05, + "loss": 2.0404, + "step": 1476 + }, + { + "epoch": 0.4483229625132797, + "grad_norm": 0.4376835525035858, + "learning_rate": 9.257365596841146e-05, + "loss": 1.7201, + "step": 1477 + }, + { + "epoch": 0.4486264987099712, + "grad_norm": 0.35988250374794006, + "learning_rate": 9.256859370254126e-05, + "loss": 1.1999, + "step": 1478 + }, + { + "epoch": 0.4489300349066626, + "grad_norm": 0.41253864765167236, + "learning_rate": 9.256353143667105e-05, + "loss": 1.9916, + "step": 1479 + }, + { + "epoch": 0.4492335711033541, + "grad_norm": 0.34956973791122437, + "learning_rate": 9.255846917080086e-05, + "loss": 1.7406, + "step": 1480 + }, + { + "epoch": 0.44953710730004554, + "grad_norm": 0.452239453792572, + "learning_rate": 9.255340690493065e-05, + "loss": 2.0101, + "step": 1481 + }, + { + "epoch": 0.44984064349673697, + "grad_norm": 0.36039796471595764, + "learning_rate": 9.254834463906045e-05, + "loss": 1.9181, + "step": 1482 + }, + { + "epoch": 0.45014417969342846, + "grad_norm": 0.34030023217201233, + "learning_rate": 9.254328237319024e-05, + "loss": 1.6803, + "step": 1483 + }, + { + "epoch": 0.4504477158901199, + "grad_norm": 0.3585798144340515, + "learning_rate": 9.253822010732004e-05, + "loss": 1.8983, + "step": 1484 + }, + { + "epoch": 0.4507512520868113, + "grad_norm": 0.4554307758808136, + "learning_rate": 9.253315784144983e-05, + "loss": 1.741, + "step": 1485 + }, + { + "epoch": 0.4510547882835028, + "grad_norm": 0.36281803250312805, + "learning_rate": 9.252809557557963e-05, + "loss": 2.0279, + "step": 1486 + }, + { + "epoch": 0.45135832448019425, + "grad_norm": 0.4025228023529053, + "learning_rate": 9.252303330970942e-05, + "loss": 1.8517, + "step": 1487 + }, + { + "epoch": 0.45166186067688574, + "grad_norm": 0.3962991535663605, + "learning_rate": 9.251797104383922e-05, + "loss": 1.9199, + "step": 1488 + }, + { + "epoch": 0.4519653968735772, + "grad_norm": 0.4201490879058838, + "learning_rate": 9.251290877796903e-05, + "loss": 2.0137, + "step": 1489 + }, + { + "epoch": 0.4522689330702686, + "grad_norm": 0.4605710804462433, + "learning_rate": 9.250784651209882e-05, + "loss": 1.975, + "step": 1490 + }, + { + "epoch": 0.4525724692669601, + "grad_norm": 0.3571039140224457, + "learning_rate": 9.250278424622862e-05, + "loss": 1.8478, + "step": 1491 + }, + { + "epoch": 0.45287600546365153, + "grad_norm": 0.406676709651947, + "learning_rate": 9.249772198035841e-05, + "loss": 1.965, + "step": 1492 + }, + { + "epoch": 0.453179541660343, + "grad_norm": 0.6116447448730469, + "learning_rate": 9.24926597144882e-05, + "loss": 1.6192, + "step": 1493 + }, + { + "epoch": 0.45348307785703446, + "grad_norm": 0.4193543493747711, + "learning_rate": 9.2487597448618e-05, + "loss": 1.8085, + "step": 1494 + }, + { + "epoch": 0.4537866140537259, + "grad_norm": 0.4082903563976288, + "learning_rate": 9.24825351827478e-05, + "loss": 1.8924, + "step": 1495 + }, + { + "epoch": 0.4540901502504174, + "grad_norm": 0.4163326919078827, + "learning_rate": 9.247747291687759e-05, + "loss": 1.9238, + "step": 1496 + }, + { + "epoch": 0.4543936864471088, + "grad_norm": 0.4481281340122223, + "learning_rate": 9.247241065100739e-05, + "loss": 1.7663, + "step": 1497 + }, + { + "epoch": 0.45469722264380025, + "grad_norm": 0.3282391428947449, + "learning_rate": 9.24673483851372e-05, + "loss": 2.0332, + "step": 1498 + }, + { + "epoch": 0.45500075884049174, + "grad_norm": 0.43553873896598816, + "learning_rate": 9.246228611926699e-05, + "loss": 2.0712, + "step": 1499 + }, + { + "epoch": 0.45530429503718317, + "grad_norm": 0.40410909056663513, + "learning_rate": 9.245722385339678e-05, + "loss": 1.9479, + "step": 1500 + }, + { + "epoch": 0.45560783123387466, + "grad_norm": 0.36232396960258484, + "learning_rate": 9.245216158752659e-05, + "loss": 1.9359, + "step": 1501 + }, + { + "epoch": 0.4559113674305661, + "grad_norm": 0.44860419631004333, + "learning_rate": 9.244709932165639e-05, + "loss": 1.263, + "step": 1502 + }, + { + "epoch": 0.45621490362725753, + "grad_norm": 0.5308701395988464, + "learning_rate": 9.244203705578618e-05, + "loss": 2.2914, + "step": 1503 + }, + { + "epoch": 0.456518439823949, + "grad_norm": 0.4460773468017578, + "learning_rate": 9.243697478991598e-05, + "loss": 1.8063, + "step": 1504 + }, + { + "epoch": 0.45682197602064045, + "grad_norm": 0.4147963523864746, + "learning_rate": 9.243191252404577e-05, + "loss": 2.045, + "step": 1505 + }, + { + "epoch": 0.45712551221733194, + "grad_norm": 0.34958329796791077, + "learning_rate": 9.242685025817557e-05, + "loss": 1.8712, + "step": 1506 + }, + { + "epoch": 0.4574290484140234, + "grad_norm": 0.36072060465812683, + "learning_rate": 9.242178799230536e-05, + "loss": 1.7198, + "step": 1507 + }, + { + "epoch": 0.4577325846107148, + "grad_norm": 0.4608067274093628, + "learning_rate": 9.241672572643515e-05, + "loss": 1.7165, + "step": 1508 + }, + { + "epoch": 0.4580361208074063, + "grad_norm": 0.39580467343330383, + "learning_rate": 9.241166346056496e-05, + "loss": 1.665, + "step": 1509 + }, + { + "epoch": 0.45833965700409773, + "grad_norm": 0.4920599162578583, + "learning_rate": 9.240660119469476e-05, + "loss": 1.872, + "step": 1510 + }, + { + "epoch": 0.45864319320078917, + "grad_norm": 0.4332992136478424, + "learning_rate": 9.240153892882455e-05, + "loss": 1.8972, + "step": 1511 + }, + { + "epoch": 0.45894672939748066, + "grad_norm": 0.39618152379989624, + "learning_rate": 9.239647666295435e-05, + "loss": 2.0167, + "step": 1512 + }, + { + "epoch": 0.4592502655941721, + "grad_norm": 0.6713082790374756, + "learning_rate": 9.239141439708414e-05, + "loss": 2.0, + "step": 1513 + }, + { + "epoch": 0.4595538017908636, + "grad_norm": 0.34422579407691956, + "learning_rate": 9.238635213121394e-05, + "loss": 1.6438, + "step": 1514 + }, + { + "epoch": 0.459857337987555, + "grad_norm": 0.43874865770339966, + "learning_rate": 9.238128986534373e-05, + "loss": 1.6388, + "step": 1515 + }, + { + "epoch": 0.46016087418424645, + "grad_norm": 0.5863097906112671, + "learning_rate": 9.237622759947353e-05, + "loss": 1.6764, + "step": 1516 + }, + { + "epoch": 0.46046441038093794, + "grad_norm": 0.3312426805496216, + "learning_rate": 9.237116533360332e-05, + "loss": 1.8491, + "step": 1517 + }, + { + "epoch": 0.4607679465776294, + "grad_norm": 0.3111588656902313, + "learning_rate": 9.236610306773312e-05, + "loss": 2.0298, + "step": 1518 + }, + { + "epoch": 0.46107148277432086, + "grad_norm": 0.38705703616142273, + "learning_rate": 9.236104080186292e-05, + "loss": 2.0584, + "step": 1519 + }, + { + "epoch": 0.4613750189710123, + "grad_norm": 0.32613542675971985, + "learning_rate": 9.235597853599272e-05, + "loss": 1.8722, + "step": 1520 + }, + { + "epoch": 0.46167855516770373, + "grad_norm": 0.9304127097129822, + "learning_rate": 9.235091627012251e-05, + "loss": 1.978, + "step": 1521 + }, + { + "epoch": 0.4619820913643952, + "grad_norm": 0.3754931688308716, + "learning_rate": 9.234585400425231e-05, + "loss": 1.8724, + "step": 1522 + }, + { + "epoch": 0.46228562756108665, + "grad_norm": 0.4033370912075043, + "learning_rate": 9.23407917383821e-05, + "loss": 1.3349, + "step": 1523 + }, + { + "epoch": 0.4625891637577781, + "grad_norm": 0.35285013914108276, + "learning_rate": 9.23357294725119e-05, + "loss": 1.442, + "step": 1524 + }, + { + "epoch": 0.4628926999544696, + "grad_norm": 0.4044554531574249, + "learning_rate": 9.23306672066417e-05, + "loss": 2.0633, + "step": 1525 + }, + { + "epoch": 0.463196236151161, + "grad_norm": 0.46915552020072937, + "learning_rate": 9.232560494077149e-05, + "loss": 1.3861, + "step": 1526 + }, + { + "epoch": 0.4634997723478525, + "grad_norm": 0.4107852280139923, + "learning_rate": 9.232054267490128e-05, + "loss": 1.9011, + "step": 1527 + }, + { + "epoch": 0.46380330854454394, + "grad_norm": 0.4018856883049011, + "learning_rate": 9.231548040903109e-05, + "loss": 1.843, + "step": 1528 + }, + { + "epoch": 0.46410684474123537, + "grad_norm": 0.36814266443252563, + "learning_rate": 9.231041814316089e-05, + "loss": 1.897, + "step": 1529 + }, + { + "epoch": 0.46441038093792686, + "grad_norm": 0.42271214723587036, + "learning_rate": 9.230535587729068e-05, + "loss": 1.9761, + "step": 1530 + }, + { + "epoch": 0.4647139171346183, + "grad_norm": 0.4548446238040924, + "learning_rate": 9.230029361142048e-05, + "loss": 1.9313, + "step": 1531 + }, + { + "epoch": 0.4650174533313098, + "grad_norm": 0.4320158064365387, + "learning_rate": 9.229523134555027e-05, + "loss": 1.5687, + "step": 1532 + }, + { + "epoch": 0.4653209895280012, + "grad_norm": 0.3909349739551544, + "learning_rate": 9.229016907968007e-05, + "loss": 1.337, + "step": 1533 + }, + { + "epoch": 0.46562452572469265, + "grad_norm": 0.40204015374183655, + "learning_rate": 9.228510681380986e-05, + "loss": 1.9838, + "step": 1534 + }, + { + "epoch": 0.46592806192138414, + "grad_norm": 0.3997584879398346, + "learning_rate": 9.228004454793966e-05, + "loss": 1.8321, + "step": 1535 + }, + { + "epoch": 0.4662315981180756, + "grad_norm": 0.43689507246017456, + "learning_rate": 9.227498228206945e-05, + "loss": 2.0649, + "step": 1536 + }, + { + "epoch": 0.466535134314767, + "grad_norm": 0.3970150649547577, + "learning_rate": 9.226992001619926e-05, + "loss": 2.0077, + "step": 1537 + }, + { + "epoch": 0.4668386705114585, + "grad_norm": 0.3847435414791107, + "learning_rate": 9.226485775032905e-05, + "loss": 2.0168, + "step": 1538 + }, + { + "epoch": 0.46714220670814993, + "grad_norm": 0.40491220355033875, + "learning_rate": 9.225979548445885e-05, + "loss": 1.7831, + "step": 1539 + }, + { + "epoch": 0.4674457429048414, + "grad_norm": 0.401903361082077, + "learning_rate": 9.225473321858864e-05, + "loss": 2.007, + "step": 1540 + }, + { + "epoch": 0.46774927910153286, + "grad_norm": 0.6656989455223083, + "learning_rate": 9.224967095271844e-05, + "loss": 2.008, + "step": 1541 + }, + { + "epoch": 0.4680528152982243, + "grad_norm": 0.36222347617149353, + "learning_rate": 9.224460868684823e-05, + "loss": 1.8659, + "step": 1542 + }, + { + "epoch": 0.4683563514949158, + "grad_norm": 0.4396745562553406, + "learning_rate": 9.223954642097803e-05, + "loss": 1.9881, + "step": 1543 + }, + { + "epoch": 0.4686598876916072, + "grad_norm": 0.5994194149971008, + "learning_rate": 9.223448415510782e-05, + "loss": 2.1998, + "step": 1544 + }, + { + "epoch": 0.4689634238882987, + "grad_norm": 0.45558032393455505, + "learning_rate": 9.222942188923763e-05, + "loss": 1.6082, + "step": 1545 + }, + { + "epoch": 0.46926696008499014, + "grad_norm": 0.38417017459869385, + "learning_rate": 9.222435962336742e-05, + "loss": 1.7655, + "step": 1546 + }, + { + "epoch": 0.46957049628168157, + "grad_norm": 0.41946941614151, + "learning_rate": 9.221929735749722e-05, + "loss": 1.7472, + "step": 1547 + }, + { + "epoch": 0.46987403247837306, + "grad_norm": 0.39455467462539673, + "learning_rate": 9.221423509162703e-05, + "loss": 1.8377, + "step": 1548 + }, + { + "epoch": 0.4701775686750645, + "grad_norm": 0.3967253565788269, + "learning_rate": 9.220917282575682e-05, + "loss": 1.7838, + "step": 1549 + }, + { + "epoch": 0.470481104871756, + "grad_norm": 0.42535534501075745, + "learning_rate": 9.220411055988662e-05, + "loss": 2.1495, + "step": 1550 + }, + { + "epoch": 0.4707846410684474, + "grad_norm": 0.36706385016441345, + "learning_rate": 9.219904829401641e-05, + "loss": 1.9875, + "step": 1551 + }, + { + "epoch": 0.47108817726513885, + "grad_norm": 0.3747560977935791, + "learning_rate": 9.219398602814621e-05, + "loss": 1.5342, + "step": 1552 + }, + { + "epoch": 0.47139171346183034, + "grad_norm": 0.34010231494903564, + "learning_rate": 9.2188923762276e-05, + "loss": 2.0999, + "step": 1553 + }, + { + "epoch": 0.4716952496585218, + "grad_norm": 0.40051451325416565, + "learning_rate": 9.21838614964058e-05, + "loss": 1.8481, + "step": 1554 + }, + { + "epoch": 0.4719987858552132, + "grad_norm": 0.5217362642288208, + "learning_rate": 9.217879923053559e-05, + "loss": 1.4746, + "step": 1555 + }, + { + "epoch": 0.4723023220519047, + "grad_norm": 0.42339226603507996, + "learning_rate": 9.217373696466539e-05, + "loss": 1.947, + "step": 1556 + }, + { + "epoch": 0.47260585824859613, + "grad_norm": 0.3780953586101532, + "learning_rate": 9.216867469879518e-05, + "loss": 2.2093, + "step": 1557 + }, + { + "epoch": 0.4729093944452876, + "grad_norm": 0.38509401679039, + "learning_rate": 9.216361243292499e-05, + "loss": 1.6966, + "step": 1558 + }, + { + "epoch": 0.47321293064197906, + "grad_norm": 0.501438319683075, + "learning_rate": 9.215855016705478e-05, + "loss": 2.0505, + "step": 1559 + }, + { + "epoch": 0.4735164668386705, + "grad_norm": 0.42260050773620605, + "learning_rate": 9.215348790118458e-05, + "loss": 1.9439, + "step": 1560 + }, + { + "epoch": 0.473820003035362, + "grad_norm": 0.6031399965286255, + "learning_rate": 9.214842563531437e-05, + "loss": 1.9674, + "step": 1561 + }, + { + "epoch": 0.4741235392320534, + "grad_norm": 0.3809618055820465, + "learning_rate": 9.214336336944417e-05, + "loss": 1.9882, + "step": 1562 + }, + { + "epoch": 0.4744270754287449, + "grad_norm": 0.4074794352054596, + "learning_rate": 9.213830110357396e-05, + "loss": 1.6648, + "step": 1563 + }, + { + "epoch": 0.47473061162543634, + "grad_norm": 0.4380822479724884, + "learning_rate": 9.213323883770376e-05, + "loss": 2.1327, + "step": 1564 + }, + { + "epoch": 0.4750341478221278, + "grad_norm": 0.6130182147026062, + "learning_rate": 9.212817657183355e-05, + "loss": 1.957, + "step": 1565 + }, + { + "epoch": 0.47533768401881926, + "grad_norm": 0.359451025724411, + "learning_rate": 9.212311430596335e-05, + "loss": 1.5301, + "step": 1566 + }, + { + "epoch": 0.4756412202155107, + "grad_norm": 0.508237898349762, + "learning_rate": 9.211805204009316e-05, + "loss": 2.1409, + "step": 1567 + }, + { + "epoch": 0.47594475641220213, + "grad_norm": 0.5652433037757874, + "learning_rate": 9.211298977422295e-05, + "loss": 2.2032, + "step": 1568 + }, + { + "epoch": 0.4762482926088936, + "grad_norm": 0.36153456568717957, + "learning_rate": 9.210792750835275e-05, + "loss": 2.0994, + "step": 1569 + }, + { + "epoch": 0.47655182880558505, + "grad_norm": 0.4140501320362091, + "learning_rate": 9.210286524248254e-05, + "loss": 1.6165, + "step": 1570 + }, + { + "epoch": 0.47685536500227654, + "grad_norm": 0.36080101132392883, + "learning_rate": 9.209780297661234e-05, + "loss": 2.0203, + "step": 1571 + }, + { + "epoch": 0.477158901198968, + "grad_norm": 0.3501390218734741, + "learning_rate": 9.209274071074213e-05, + "loss": 1.9692, + "step": 1572 + }, + { + "epoch": 0.4774624373956594, + "grad_norm": 0.3753308653831482, + "learning_rate": 9.208767844487192e-05, + "loss": 1.799, + "step": 1573 + }, + { + "epoch": 0.4777659735923509, + "grad_norm": 0.3621695935726166, + "learning_rate": 9.208261617900172e-05, + "loss": 1.8412, + "step": 1574 + }, + { + "epoch": 0.47806950978904234, + "grad_norm": 0.4215545952320099, + "learning_rate": 9.207755391313151e-05, + "loss": 1.8227, + "step": 1575 + }, + { + "epoch": 0.4783730459857338, + "grad_norm": 0.32205232977867126, + "learning_rate": 9.207249164726132e-05, + "loss": 1.3949, + "step": 1576 + }, + { + "epoch": 0.47867658218242526, + "grad_norm": 0.34510162472724915, + "learning_rate": 9.206742938139112e-05, + "loss": 1.8627, + "step": 1577 + }, + { + "epoch": 0.4789801183791167, + "grad_norm": 0.41916847229003906, + "learning_rate": 9.206236711552091e-05, + "loss": 1.6164, + "step": 1578 + }, + { + "epoch": 0.4792836545758082, + "grad_norm": 0.323519229888916, + "learning_rate": 9.205730484965071e-05, + "loss": 1.5688, + "step": 1579 + }, + { + "epoch": 0.4795871907724996, + "grad_norm": 0.4150819778442383, + "learning_rate": 9.20522425837805e-05, + "loss": 1.8097, + "step": 1580 + }, + { + "epoch": 0.47989072696919105, + "grad_norm": 0.4045346975326538, + "learning_rate": 9.20471803179103e-05, + "loss": 1.91, + "step": 1581 + }, + { + "epoch": 0.48019426316588254, + "grad_norm": 0.3251115083694458, + "learning_rate": 9.204211805204009e-05, + "loss": 1.9278, + "step": 1582 + }, + { + "epoch": 0.480497799362574, + "grad_norm": 0.37068256735801697, + "learning_rate": 9.203705578616989e-05, + "loss": 1.8667, + "step": 1583 + }, + { + "epoch": 0.48080133555926546, + "grad_norm": 0.4208294749259949, + "learning_rate": 9.203199352029968e-05, + "loss": 1.9405, + "step": 1584 + }, + { + "epoch": 0.4811048717559569, + "grad_norm": 0.3996240794658661, + "learning_rate": 9.202693125442948e-05, + "loss": 1.6466, + "step": 1585 + }, + { + "epoch": 0.48140840795264833, + "grad_norm": 0.44182920455932617, + "learning_rate": 9.202186898855928e-05, + "loss": 2.0223, + "step": 1586 + }, + { + "epoch": 0.4817119441493398, + "grad_norm": 0.43203607201576233, + "learning_rate": 9.201680672268908e-05, + "loss": 1.7969, + "step": 1587 + }, + { + "epoch": 0.48201548034603126, + "grad_norm": 0.3604522943496704, + "learning_rate": 9.201174445681887e-05, + "loss": 2.0201, + "step": 1588 + }, + { + "epoch": 0.48231901654272274, + "grad_norm": 0.4073752760887146, + "learning_rate": 9.200668219094867e-05, + "loss": 1.993, + "step": 1589 + }, + { + "epoch": 0.4826225527394142, + "grad_norm": 0.39307650923728943, + "learning_rate": 9.200161992507848e-05, + "loss": 2.3445, + "step": 1590 + }, + { + "epoch": 0.4829260889361056, + "grad_norm": 0.355831503868103, + "learning_rate": 9.199655765920827e-05, + "loss": 2.0101, + "step": 1591 + }, + { + "epoch": 0.4832296251327971, + "grad_norm": 0.5814805030822754, + "learning_rate": 9.199149539333807e-05, + "loss": 2.2421, + "step": 1592 + }, + { + "epoch": 0.48353316132948854, + "grad_norm": 0.4290510416030884, + "learning_rate": 9.198643312746786e-05, + "loss": 2.1818, + "step": 1593 + }, + { + "epoch": 0.48383669752617997, + "grad_norm": 7.360002040863037, + "learning_rate": 9.198137086159766e-05, + "loss": 2.0011, + "step": 1594 + }, + { + "epoch": 0.48414023372287146, + "grad_norm": 0.5217785835266113, + "learning_rate": 9.197630859572745e-05, + "loss": 1.817, + "step": 1595 + }, + { + "epoch": 0.4844437699195629, + "grad_norm": 0.4188072383403778, + "learning_rate": 9.197124632985725e-05, + "loss": 1.5588, + "step": 1596 + }, + { + "epoch": 0.4847473061162544, + "grad_norm": 0.4220346212387085, + "learning_rate": 9.196618406398705e-05, + "loss": 1.9217, + "step": 1597 + }, + { + "epoch": 0.4850508423129458, + "grad_norm": 0.5502439141273499, + "learning_rate": 9.196112179811685e-05, + "loss": 1.841, + "step": 1598 + }, + { + "epoch": 0.48535437850963725, + "grad_norm": 0.4167909622192383, + "learning_rate": 9.195605953224664e-05, + "loss": 1.7868, + "step": 1599 + }, + { + "epoch": 0.48565791470632874, + "grad_norm": 0.45999062061309814, + "learning_rate": 9.195099726637644e-05, + "loss": 1.8886, + "step": 1600 + }, + { + "epoch": 0.4859614509030202, + "grad_norm": 0.3937031626701355, + "learning_rate": 9.194593500050623e-05, + "loss": 1.7736, + "step": 1601 + }, + { + "epoch": 0.48626498709971167, + "grad_norm": 0.44424453377723694, + "learning_rate": 9.194087273463603e-05, + "loss": 1.6786, + "step": 1602 + }, + { + "epoch": 0.4865685232964031, + "grad_norm": 0.35432350635528564, + "learning_rate": 9.193581046876582e-05, + "loss": 1.8425, + "step": 1603 + }, + { + "epoch": 0.48687205949309453, + "grad_norm": 0.41191428899765015, + "learning_rate": 9.193074820289562e-05, + "loss": 1.342, + "step": 1604 + }, + { + "epoch": 0.487175595689786, + "grad_norm": 0.4410790503025055, + "learning_rate": 9.192568593702541e-05, + "loss": 1.3158, + "step": 1605 + }, + { + "epoch": 0.48747913188647746, + "grad_norm": 0.4214244782924652, + "learning_rate": 9.192062367115522e-05, + "loss": 2.1983, + "step": 1606 + }, + { + "epoch": 0.4877826680831689, + "grad_norm": 0.4066750109195709, + "learning_rate": 9.191556140528502e-05, + "loss": 1.5839, + "step": 1607 + }, + { + "epoch": 0.4880862042798604, + "grad_norm": 0.5248275995254517, + "learning_rate": 9.191049913941481e-05, + "loss": 1.6415, + "step": 1608 + }, + { + "epoch": 0.4883897404765518, + "grad_norm": 0.3945814073085785, + "learning_rate": 9.19054368735446e-05, + "loss": 1.6788, + "step": 1609 + }, + { + "epoch": 0.4886932766732433, + "grad_norm": 0.42285215854644775, + "learning_rate": 9.19003746076744e-05, + "loss": 1.7365, + "step": 1610 + }, + { + "epoch": 0.48899681286993474, + "grad_norm": 0.43051236867904663, + "learning_rate": 9.18953123418042e-05, + "loss": 1.8906, + "step": 1611 + }, + { + "epoch": 0.4893003490666262, + "grad_norm": 0.4336687922477722, + "learning_rate": 9.189025007593399e-05, + "loss": 1.6145, + "step": 1612 + }, + { + "epoch": 0.48960388526331766, + "grad_norm": 0.34237489104270935, + "learning_rate": 9.188518781006378e-05, + "loss": 1.9992, + "step": 1613 + }, + { + "epoch": 0.4899074214600091, + "grad_norm": 0.4344857931137085, + "learning_rate": 9.188012554419358e-05, + "loss": 1.9943, + "step": 1614 + }, + { + "epoch": 0.4902109576567006, + "grad_norm": 0.3851914703845978, + "learning_rate": 9.187506327832339e-05, + "loss": 1.8428, + "step": 1615 + }, + { + "epoch": 0.490514493853392, + "grad_norm": 0.39165550470352173, + "learning_rate": 9.187000101245318e-05, + "loss": 1.7958, + "step": 1616 + }, + { + "epoch": 0.49081803005008345, + "grad_norm": 0.34605157375335693, + "learning_rate": 9.186493874658298e-05, + "loss": 1.9257, + "step": 1617 + }, + { + "epoch": 0.49112156624677494, + "grad_norm": 0.422831654548645, + "learning_rate": 9.185987648071277e-05, + "loss": 2.1828, + "step": 1618 + }, + { + "epoch": 0.4914251024434664, + "grad_norm": 0.7868388891220093, + "learning_rate": 9.185481421484257e-05, + "loss": 1.4172, + "step": 1619 + }, + { + "epoch": 0.4917286386401578, + "grad_norm": 0.3971206247806549, + "learning_rate": 9.184975194897236e-05, + "loss": 1.8442, + "step": 1620 + }, + { + "epoch": 0.4920321748368493, + "grad_norm": 0.39479488134384155, + "learning_rate": 9.184468968310216e-05, + "loss": 1.6141, + "step": 1621 + }, + { + "epoch": 0.49233571103354073, + "grad_norm": 2.7340400218963623, + "learning_rate": 9.183962741723195e-05, + "loss": 1.7567, + "step": 1622 + }, + { + "epoch": 0.4926392472302322, + "grad_norm": 0.7024746537208557, + "learning_rate": 9.183456515136175e-05, + "loss": 2.3221, + "step": 1623 + }, + { + "epoch": 0.49294278342692366, + "grad_norm": 0.3881623148918152, + "learning_rate": 9.182950288549154e-05, + "loss": 2.0143, + "step": 1624 + }, + { + "epoch": 0.4932463196236151, + "grad_norm": 0.35226500034332275, + "learning_rate": 9.182444061962135e-05, + "loss": 1.8097, + "step": 1625 + }, + { + "epoch": 0.4935498558203066, + "grad_norm": 0.9839766621589661, + "learning_rate": 9.181937835375114e-05, + "loss": 1.9594, + "step": 1626 + }, + { + "epoch": 0.493853392016998, + "grad_norm": 0.333279013633728, + "learning_rate": 9.181431608788094e-05, + "loss": 1.8533, + "step": 1627 + }, + { + "epoch": 0.4941569282136895, + "grad_norm": 0.6945008039474487, + "learning_rate": 9.180925382201073e-05, + "loss": 1.3658, + "step": 1628 + }, + { + "epoch": 0.49446046441038094, + "grad_norm": 0.4481600224971771, + "learning_rate": 9.180419155614053e-05, + "loss": 1.9189, + "step": 1629 + }, + { + "epoch": 0.4947640006070724, + "grad_norm": 0.35472220182418823, + "learning_rate": 9.179912929027032e-05, + "loss": 1.3206, + "step": 1630 + }, + { + "epoch": 0.49506753680376386, + "grad_norm": 0.5124238729476929, + "learning_rate": 9.179406702440012e-05, + "loss": 2.0371, + "step": 1631 + }, + { + "epoch": 0.4953710730004553, + "grad_norm": 0.3843775987625122, + "learning_rate": 9.178900475852991e-05, + "loss": 1.5858, + "step": 1632 + }, + { + "epoch": 0.4956746091971468, + "grad_norm": 0.41060924530029297, + "learning_rate": 9.178394249265971e-05, + "loss": 1.4591, + "step": 1633 + }, + { + "epoch": 0.4959781453938382, + "grad_norm": 0.5426920056343079, + "learning_rate": 9.177888022678952e-05, + "loss": 2.2744, + "step": 1634 + }, + { + "epoch": 0.49628168159052966, + "grad_norm": 0.4275033175945282, + "learning_rate": 9.177381796091931e-05, + "loss": 1.9274, + "step": 1635 + }, + { + "epoch": 0.49658521778722114, + "grad_norm": 0.4715273976325989, + "learning_rate": 9.176875569504912e-05, + "loss": 1.5788, + "step": 1636 + }, + { + "epoch": 0.4968887539839126, + "grad_norm": 0.41464027762413025, + "learning_rate": 9.176369342917891e-05, + "loss": 1.8147, + "step": 1637 + }, + { + "epoch": 0.497192290180604, + "grad_norm": 0.4175771176815033, + "learning_rate": 9.175863116330871e-05, + "loss": 2.02, + "step": 1638 + }, + { + "epoch": 0.4974958263772955, + "grad_norm": 0.42781904339790344, + "learning_rate": 9.17535688974385e-05, + "loss": 1.8772, + "step": 1639 + }, + { + "epoch": 0.49779936257398694, + "grad_norm": 0.381352961063385, + "learning_rate": 9.17485066315683e-05, + "loss": 1.9982, + "step": 1640 + }, + { + "epoch": 0.4981028987706784, + "grad_norm": 0.44887885451316833, + "learning_rate": 9.174344436569809e-05, + "loss": 1.6724, + "step": 1641 + }, + { + "epoch": 0.49840643496736986, + "grad_norm": 0.3764267563819885, + "learning_rate": 9.173838209982789e-05, + "loss": 1.7327, + "step": 1642 + }, + { + "epoch": 0.4987099711640613, + "grad_norm": 0.6911460161209106, + "learning_rate": 9.173331983395768e-05, + "loss": 2.1353, + "step": 1643 + }, + { + "epoch": 0.4990135073607528, + "grad_norm": 0.39581048488616943, + "learning_rate": 9.172825756808748e-05, + "loss": 2.1394, + "step": 1644 + }, + { + "epoch": 0.4993170435574442, + "grad_norm": 0.420389860868454, + "learning_rate": 9.172319530221729e-05, + "loss": 2.0948, + "step": 1645 + }, + { + "epoch": 0.4996205797541357, + "grad_norm": 0.3843049108982086, + "learning_rate": 9.171813303634708e-05, + "loss": 2.0618, + "step": 1646 + }, + { + "epoch": 0.49992411595082714, + "grad_norm": 0.3946545422077179, + "learning_rate": 9.171307077047688e-05, + "loss": 1.7997, + "step": 1647 + }, + { + "epoch": 0.5002276521475186, + "grad_norm": 0.3740834593772888, + "learning_rate": 9.170800850460667e-05, + "loss": 1.8436, + "step": 1648 + }, + { + "epoch": 0.5005311883442101, + "grad_norm": 0.42691826820373535, + "learning_rate": 9.170294623873646e-05, + "loss": 1.8915, + "step": 1649 + }, + { + "epoch": 0.5008347245409015, + "grad_norm": 0.41487646102905273, + "learning_rate": 9.169788397286626e-05, + "loss": 1.6712, + "step": 1650 + }, + { + "epoch": 0.5011382607375929, + "grad_norm": 0.44870665669441223, + "learning_rate": 9.169282170699605e-05, + "loss": 1.3704, + "step": 1651 + }, + { + "epoch": 0.5014417969342844, + "grad_norm": 0.5584750771522522, + "learning_rate": 9.168775944112585e-05, + "loss": 2.2258, + "step": 1652 + }, + { + "epoch": 0.5017453331309759, + "grad_norm": 0.4336828291416168, + "learning_rate": 9.168269717525564e-05, + "loss": 2.0302, + "step": 1653 + }, + { + "epoch": 0.5020488693276673, + "grad_norm": 0.3990234434604645, + "learning_rate": 9.167763490938545e-05, + "loss": 1.9813, + "step": 1654 + }, + { + "epoch": 0.5023524055243588, + "grad_norm": 0.42252814769744873, + "learning_rate": 9.167257264351525e-05, + "loss": 1.7596, + "step": 1655 + }, + { + "epoch": 0.5026559417210502, + "grad_norm": 0.42766478657722473, + "learning_rate": 9.166751037764504e-05, + "loss": 1.4622, + "step": 1656 + }, + { + "epoch": 0.5029594779177416, + "grad_norm": 0.4347383975982666, + "learning_rate": 9.166244811177484e-05, + "loss": 1.4987, + "step": 1657 + }, + { + "epoch": 0.5032630141144332, + "grad_norm": 0.3660615384578705, + "learning_rate": 9.165738584590463e-05, + "loss": 1.3758, + "step": 1658 + }, + { + "epoch": 0.5035665503111246, + "grad_norm": 0.3933682441711426, + "learning_rate": 9.165232358003443e-05, + "loss": 1.9097, + "step": 1659 + }, + { + "epoch": 0.5038700865078161, + "grad_norm": 0.39718765020370483, + "learning_rate": 9.164726131416422e-05, + "loss": 1.9825, + "step": 1660 + }, + { + "epoch": 0.5041736227045075, + "grad_norm": 0.4161352515220642, + "learning_rate": 9.164219904829402e-05, + "loss": 1.6346, + "step": 1661 + }, + { + "epoch": 0.5044771589011989, + "grad_norm": 0.413492888212204, + "learning_rate": 9.163713678242381e-05, + "loss": 1.9286, + "step": 1662 + }, + { + "epoch": 0.5047806950978905, + "grad_norm": 0.4211573600769043, + "learning_rate": 9.16320745165536e-05, + "loss": 1.5557, + "step": 1663 + }, + { + "epoch": 0.5050842312945819, + "grad_norm": 0.3247505724430084, + "learning_rate": 9.162701225068341e-05, + "loss": 1.8372, + "step": 1664 + }, + { + "epoch": 0.5053877674912733, + "grad_norm": 0.699242889881134, + "learning_rate": 9.162194998481321e-05, + "loss": 1.3331, + "step": 1665 + }, + { + "epoch": 0.5056913036879648, + "grad_norm": 0.45382243394851685, + "learning_rate": 9.1616887718943e-05, + "loss": 1.5317, + "step": 1666 + }, + { + "epoch": 0.5059948398846562, + "grad_norm": 0.37562644481658936, + "learning_rate": 9.16118254530728e-05, + "loss": 1.4138, + "step": 1667 + }, + { + "epoch": 0.5062983760813476, + "grad_norm": 0.41830095648765564, + "learning_rate": 9.160676318720259e-05, + "loss": 2.0788, + "step": 1668 + }, + { + "epoch": 0.5066019122780392, + "grad_norm": 0.4154708981513977, + "learning_rate": 9.160170092133239e-05, + "loss": 2.0095, + "step": 1669 + }, + { + "epoch": 0.5069054484747306, + "grad_norm": 0.3693794906139374, + "learning_rate": 9.159663865546218e-05, + "loss": 1.8871, + "step": 1670 + }, + { + "epoch": 0.5072089846714221, + "grad_norm": 0.42712700366973877, + "learning_rate": 9.159157638959198e-05, + "loss": 1.9114, + "step": 1671 + }, + { + "epoch": 0.5075125208681135, + "grad_norm": 0.406843900680542, + "learning_rate": 9.158651412372177e-05, + "loss": 1.7887, + "step": 1672 + }, + { + "epoch": 0.5078160570648049, + "grad_norm": 0.3689083456993103, + "learning_rate": 9.158145185785158e-05, + "loss": 1.8421, + "step": 1673 + }, + { + "epoch": 0.5081195932614965, + "grad_norm": 0.40796002745628357, + "learning_rate": 9.157638959198138e-05, + "loss": 1.5014, + "step": 1674 + }, + { + "epoch": 0.5084231294581879, + "grad_norm": 0.44102364778518677, + "learning_rate": 9.157132732611117e-05, + "loss": 1.5184, + "step": 1675 + }, + { + "epoch": 0.5087266656548793, + "grad_norm": 0.4265199899673462, + "learning_rate": 9.156626506024096e-05, + "loss": 2.017, + "step": 1676 + }, + { + "epoch": 0.5090302018515708, + "grad_norm": 0.4618091285228729, + "learning_rate": 9.156120279437076e-05, + "loss": 2.056, + "step": 1677 + }, + { + "epoch": 0.5093337380482622, + "grad_norm": 0.4058600068092346, + "learning_rate": 9.155614052850055e-05, + "loss": 1.9897, + "step": 1678 + }, + { + "epoch": 0.5096372742449538, + "grad_norm": 0.46722692251205444, + "learning_rate": 9.155107826263036e-05, + "loss": 1.9713, + "step": 1679 + }, + { + "epoch": 0.5099408104416452, + "grad_norm": 0.36259156465530396, + "learning_rate": 9.154601599676016e-05, + "loss": 1.9321, + "step": 1680 + }, + { + "epoch": 0.5102443466383366, + "grad_norm": 0.366148442029953, + "learning_rate": 9.154095373088995e-05, + "loss": 1.9573, + "step": 1681 + }, + { + "epoch": 0.510547882835028, + "grad_norm": 0.3328361213207245, + "learning_rate": 9.153589146501975e-05, + "loss": 1.8222, + "step": 1682 + }, + { + "epoch": 0.5108514190317195, + "grad_norm": 0.45891711115837097, + "learning_rate": 9.153082919914954e-05, + "loss": 1.7177, + "step": 1683 + }, + { + "epoch": 0.511154955228411, + "grad_norm": 0.4405977427959442, + "learning_rate": 9.152576693327935e-05, + "loss": 1.8499, + "step": 1684 + }, + { + "epoch": 0.5114584914251025, + "grad_norm": 0.7388264536857605, + "learning_rate": 9.152070466740915e-05, + "loss": 1.9884, + "step": 1685 + }, + { + "epoch": 0.5117620276217939, + "grad_norm": 0.43892955780029297, + "learning_rate": 9.151564240153894e-05, + "loss": 2.0027, + "step": 1686 + }, + { + "epoch": 0.5120655638184853, + "grad_norm": 0.42659783363342285, + "learning_rate": 9.151058013566873e-05, + "loss": 1.8386, + "step": 1687 + }, + { + "epoch": 0.5123691000151768, + "grad_norm": 0.4364768862724304, + "learning_rate": 9.150551786979853e-05, + "loss": 1.6248, + "step": 1688 + }, + { + "epoch": 0.5126726362118683, + "grad_norm": 0.35849112272262573, + "learning_rate": 9.150045560392832e-05, + "loss": 2.0983, + "step": 1689 + }, + { + "epoch": 0.5129761724085597, + "grad_norm": 0.38595572113990784, + "learning_rate": 9.149539333805812e-05, + "loss": 1.956, + "step": 1690 + }, + { + "epoch": 0.5132797086052512, + "grad_norm": 0.4161504805088043, + "learning_rate": 9.149033107218791e-05, + "loss": 1.8132, + "step": 1691 + }, + { + "epoch": 0.5135832448019426, + "grad_norm": 0.6614299416542053, + "learning_rate": 9.148526880631771e-05, + "loss": 1.4403, + "step": 1692 + }, + { + "epoch": 0.513886780998634, + "grad_norm": 0.4609692692756653, + "learning_rate": 9.148020654044752e-05, + "loss": 1.9215, + "step": 1693 + }, + { + "epoch": 0.5141903171953256, + "grad_norm": 0.4489036202430725, + "learning_rate": 9.147514427457731e-05, + "loss": 1.7922, + "step": 1694 + }, + { + "epoch": 0.514493853392017, + "grad_norm": 0.46497032046318054, + "learning_rate": 9.14700820087071e-05, + "loss": 1.6058, + "step": 1695 + }, + { + "epoch": 0.5147973895887085, + "grad_norm": 0.39706695079803467, + "learning_rate": 9.14650197428369e-05, + "loss": 1.7717, + "step": 1696 + }, + { + "epoch": 0.5151009257853999, + "grad_norm": 0.3839566111564636, + "learning_rate": 9.14599574769667e-05, + "loss": 1.8218, + "step": 1697 + }, + { + "epoch": 0.5154044619820913, + "grad_norm": 0.7339301109313965, + "learning_rate": 9.145489521109649e-05, + "loss": 1.9836, + "step": 1698 + }, + { + "epoch": 0.5157079981787828, + "grad_norm": 0.4512780010700226, + "learning_rate": 9.144983294522629e-05, + "loss": 2.0034, + "step": 1699 + }, + { + "epoch": 0.5160115343754743, + "grad_norm": 1.845346212387085, + "learning_rate": 9.144477067935608e-05, + "loss": 1.6995, + "step": 1700 + }, + { + "epoch": 0.5163150705721657, + "grad_norm": 0.42541632056236267, + "learning_rate": 9.143970841348588e-05, + "loss": 2.0264, + "step": 1701 + }, + { + "epoch": 0.5166186067688572, + "grad_norm": 0.404821515083313, + "learning_rate": 9.143464614761567e-05, + "loss": 1.9064, + "step": 1702 + }, + { + "epoch": 0.5169221429655486, + "grad_norm": 0.4223015606403351, + "learning_rate": 9.142958388174548e-05, + "loss": 1.8442, + "step": 1703 + }, + { + "epoch": 0.51722567916224, + "grad_norm": 0.38094672560691833, + "learning_rate": 9.142452161587527e-05, + "loss": 1.7625, + "step": 1704 + }, + { + "epoch": 0.5175292153589316, + "grad_norm": 0.3759573698043823, + "learning_rate": 9.141945935000507e-05, + "loss": 2.0585, + "step": 1705 + }, + { + "epoch": 0.517832751555623, + "grad_norm": 0.3938165307044983, + "learning_rate": 9.141439708413486e-05, + "loss": 1.9594, + "step": 1706 + }, + { + "epoch": 0.5181362877523145, + "grad_norm": 0.4222012758255005, + "learning_rate": 9.140933481826466e-05, + "loss": 1.1698, + "step": 1707 + }, + { + "epoch": 0.5184398239490059, + "grad_norm": 0.419763445854187, + "learning_rate": 9.140427255239445e-05, + "loss": 1.9484, + "step": 1708 + }, + { + "epoch": 0.5187433601456973, + "grad_norm": 0.4546319544315338, + "learning_rate": 9.139921028652425e-05, + "loss": 1.9924, + "step": 1709 + }, + { + "epoch": 0.5190468963423889, + "grad_norm": 0.5007880330085754, + "learning_rate": 9.139414802065404e-05, + "loss": 2.0619, + "step": 1710 + }, + { + "epoch": 0.5193504325390803, + "grad_norm": 0.3647090494632721, + "learning_rate": 9.138908575478384e-05, + "loss": 1.9504, + "step": 1711 + }, + { + "epoch": 0.5196539687357717, + "grad_norm": 0.4546000063419342, + "learning_rate": 9.138402348891365e-05, + "loss": 2.0943, + "step": 1712 + }, + { + "epoch": 0.5199575049324632, + "grad_norm": 0.36992448568344116, + "learning_rate": 9.137896122304344e-05, + "loss": 1.8111, + "step": 1713 + }, + { + "epoch": 0.5202610411291546, + "grad_norm": 0.40882760286331177, + "learning_rate": 9.137389895717323e-05, + "loss": 1.8935, + "step": 1714 + }, + { + "epoch": 0.5205645773258462, + "grad_norm": 0.39158037304878235, + "learning_rate": 9.136883669130303e-05, + "loss": 1.4147, + "step": 1715 + }, + { + "epoch": 0.5208681135225376, + "grad_norm": 0.42174550890922546, + "learning_rate": 9.136377442543282e-05, + "loss": 1.5931, + "step": 1716 + }, + { + "epoch": 0.521171649719229, + "grad_norm": 0.4003652036190033, + "learning_rate": 9.135871215956262e-05, + "loss": 1.4119, + "step": 1717 + }, + { + "epoch": 0.5214751859159205, + "grad_norm": 0.42328763008117676, + "learning_rate": 9.135364989369241e-05, + "loss": 1.6943, + "step": 1718 + }, + { + "epoch": 0.5217787221126119, + "grad_norm": 0.3831746578216553, + "learning_rate": 9.134858762782221e-05, + "loss": 1.8067, + "step": 1719 + }, + { + "epoch": 0.5220822583093034, + "grad_norm": 0.4160243272781372, + "learning_rate": 9.1343525361952e-05, + "loss": 2.0725, + "step": 1720 + }, + { + "epoch": 0.5223857945059949, + "grad_norm": 0.47441422939300537, + "learning_rate": 9.133846309608181e-05, + "loss": 2.2569, + "step": 1721 + }, + { + "epoch": 0.5226893307026863, + "grad_norm": 0.34522169828414917, + "learning_rate": 9.133340083021161e-05, + "loss": 1.6977, + "step": 1722 + }, + { + "epoch": 0.5229928668993777, + "grad_norm": 0.6760712265968323, + "learning_rate": 9.132833856434142e-05, + "loss": 1.7252, + "step": 1723 + }, + { + "epoch": 0.5232964030960692, + "grad_norm": 0.42016392946243286, + "learning_rate": 9.132327629847121e-05, + "loss": 1.9835, + "step": 1724 + }, + { + "epoch": 0.5235999392927606, + "grad_norm": 0.4062696099281311, + "learning_rate": 9.1318214032601e-05, + "loss": 1.8181, + "step": 1725 + }, + { + "epoch": 0.5239034754894522, + "grad_norm": 0.37092477083206177, + "learning_rate": 9.13131517667308e-05, + "loss": 1.9989, + "step": 1726 + }, + { + "epoch": 0.5242070116861436, + "grad_norm": 0.30382564663887024, + "learning_rate": 9.13080895008606e-05, + "loss": 1.5613, + "step": 1727 + }, + { + "epoch": 0.524510547882835, + "grad_norm": 0.39715448021888733, + "learning_rate": 9.130302723499039e-05, + "loss": 1.8396, + "step": 1728 + }, + { + "epoch": 0.5248140840795265, + "grad_norm": 0.698819637298584, + "learning_rate": 9.129796496912018e-05, + "loss": 1.6617, + "step": 1729 + }, + { + "epoch": 0.5251176202762179, + "grad_norm": 0.37083616852760315, + "learning_rate": 9.129290270324998e-05, + "loss": 1.0619, + "step": 1730 + }, + { + "epoch": 0.5254211564729094, + "grad_norm": 0.37196993827819824, + "learning_rate": 9.128784043737977e-05, + "loss": 1.4654, + "step": 1731 + }, + { + "epoch": 0.5257246926696009, + "grad_norm": 0.38970932364463806, + "learning_rate": 9.128277817150958e-05, + "loss": 1.9632, + "step": 1732 + }, + { + "epoch": 0.5260282288662923, + "grad_norm": 0.4937323033809662, + "learning_rate": 9.127771590563938e-05, + "loss": 1.5989, + "step": 1733 + }, + { + "epoch": 0.5263317650629837, + "grad_norm": 0.37157008051872253, + "learning_rate": 9.127265363976917e-05, + "loss": 1.8486, + "step": 1734 + }, + { + "epoch": 0.5266353012596752, + "grad_norm": 0.3973872661590576, + "learning_rate": 9.126759137389897e-05, + "loss": 1.5195, + "step": 1735 + }, + { + "epoch": 0.5269388374563667, + "grad_norm": 0.3511494994163513, + "learning_rate": 9.126252910802876e-05, + "loss": 1.9055, + "step": 1736 + }, + { + "epoch": 0.5272423736530581, + "grad_norm": 0.36223629117012024, + "learning_rate": 9.125746684215856e-05, + "loss": 1.5545, + "step": 1737 + }, + { + "epoch": 0.5275459098497496, + "grad_norm": 0.4978778660297394, + "learning_rate": 9.125240457628835e-05, + "loss": 1.9145, + "step": 1738 + }, + { + "epoch": 0.527849446046441, + "grad_norm": 0.3191153407096863, + "learning_rate": 9.124734231041815e-05, + "loss": 1.754, + "step": 1739 + }, + { + "epoch": 0.5281529822431325, + "grad_norm": 0.39094769954681396, + "learning_rate": 9.124228004454794e-05, + "loss": 1.9462, + "step": 1740 + }, + { + "epoch": 0.528456518439824, + "grad_norm": 0.6246857047080994, + "learning_rate": 9.123721777867774e-05, + "loss": 2.0239, + "step": 1741 + }, + { + "epoch": 0.5287600546365154, + "grad_norm": 0.41962483525276184, + "learning_rate": 9.123215551280754e-05, + "loss": 1.9372, + "step": 1742 + }, + { + "epoch": 0.5290635908332069, + "grad_norm": 0.3055092394351959, + "learning_rate": 9.122709324693734e-05, + "loss": 0.9516, + "step": 1743 + }, + { + "epoch": 0.5293671270298983, + "grad_norm": 0.4911038875579834, + "learning_rate": 9.122203098106713e-05, + "loss": 1.7127, + "step": 1744 + }, + { + "epoch": 0.5296706632265897, + "grad_norm": 0.7481783032417297, + "learning_rate": 9.121696871519693e-05, + "loss": 2.1368, + "step": 1745 + }, + { + "epoch": 0.5299741994232813, + "grad_norm": 0.4397221803665161, + "learning_rate": 9.121190644932672e-05, + "loss": 1.978, + "step": 1746 + }, + { + "epoch": 0.5302777356199727, + "grad_norm": 0.3751915991306305, + "learning_rate": 9.120684418345652e-05, + "loss": 2.0627, + "step": 1747 + }, + { + "epoch": 0.5305812718166641, + "grad_norm": 0.474575400352478, + "learning_rate": 9.120178191758631e-05, + "loss": 2.018, + "step": 1748 + }, + { + "epoch": 0.5308848080133556, + "grad_norm": 0.3762502372264862, + "learning_rate": 9.119671965171611e-05, + "loss": 1.7076, + "step": 1749 + }, + { + "epoch": 0.531188344210047, + "grad_norm": 0.4058527946472168, + "learning_rate": 9.11916573858459e-05, + "loss": 1.768, + "step": 1750 + }, + { + "epoch": 0.5314918804067384, + "grad_norm": 0.3765137791633606, + "learning_rate": 9.118659511997571e-05, + "loss": 1.8357, + "step": 1751 + }, + { + "epoch": 0.53179541660343, + "grad_norm": 0.459602415561676, + "learning_rate": 9.11815328541055e-05, + "loss": 0.918, + "step": 1752 + }, + { + "epoch": 0.5320989528001214, + "grad_norm": 0.4160063564777374, + "learning_rate": 9.11764705882353e-05, + "loss": 1.8438, + "step": 1753 + }, + { + "epoch": 0.5324024889968129, + "grad_norm": 0.44720131158828735, + "learning_rate": 9.11714083223651e-05, + "loss": 1.6503, + "step": 1754 + }, + { + "epoch": 0.5327060251935043, + "grad_norm": 0.35455620288848877, + "learning_rate": 9.116634605649489e-05, + "loss": 2.0683, + "step": 1755 + }, + { + "epoch": 0.5330095613901957, + "grad_norm": 0.3938636779785156, + "learning_rate": 9.116128379062468e-05, + "loss": 1.8191, + "step": 1756 + }, + { + "epoch": 0.5333130975868873, + "grad_norm": 0.38144779205322266, + "learning_rate": 9.115622152475448e-05, + "loss": 1.4855, + "step": 1757 + }, + { + "epoch": 0.5336166337835787, + "grad_norm": 0.3418583571910858, + "learning_rate": 9.115115925888427e-05, + "loss": 1.8684, + "step": 1758 + }, + { + "epoch": 0.5339201699802701, + "grad_norm": 0.3342360854148865, + "learning_rate": 9.114609699301407e-05, + "loss": 1.7817, + "step": 1759 + }, + { + "epoch": 0.5342237061769616, + "grad_norm": 0.4178410768508911, + "learning_rate": 9.114103472714388e-05, + "loss": 1.848, + "step": 1760 + }, + { + "epoch": 0.534527242373653, + "grad_norm": 0.37378084659576416, + "learning_rate": 9.113597246127367e-05, + "loss": 2.1967, + "step": 1761 + }, + { + "epoch": 0.5348307785703446, + "grad_norm": 0.33370524644851685, + "learning_rate": 9.113091019540347e-05, + "loss": 1.9943, + "step": 1762 + }, + { + "epoch": 0.535134314767036, + "grad_norm": 0.3402559757232666, + "learning_rate": 9.112584792953326e-05, + "loss": 1.7164, + "step": 1763 + }, + { + "epoch": 0.5354378509637274, + "grad_norm": 0.3962159752845764, + "learning_rate": 9.112078566366306e-05, + "loss": 1.8821, + "step": 1764 + }, + { + "epoch": 0.5357413871604189, + "grad_norm": 0.4659918546676636, + "learning_rate": 9.111572339779285e-05, + "loss": 1.7065, + "step": 1765 + }, + { + "epoch": 0.5360449233571103, + "grad_norm": 0.38673698902130127, + "learning_rate": 9.111066113192265e-05, + "loss": 1.8969, + "step": 1766 + }, + { + "epoch": 0.5363484595538018, + "grad_norm": 0.3595302999019623, + "learning_rate": 9.110559886605244e-05, + "loss": 1.881, + "step": 1767 + }, + { + "epoch": 0.5366519957504933, + "grad_norm": 0.4756614565849304, + "learning_rate": 9.110053660018225e-05, + "loss": 1.9395, + "step": 1768 + }, + { + "epoch": 0.5369555319471847, + "grad_norm": 0.36729127168655396, + "learning_rate": 9.109547433431204e-05, + "loss": 2.0762, + "step": 1769 + }, + { + "epoch": 0.5372590681438761, + "grad_norm": 0.5436307191848755, + "learning_rate": 9.109041206844184e-05, + "loss": 1.9971, + "step": 1770 + }, + { + "epoch": 0.5375626043405676, + "grad_norm": 0.42176029086112976, + "learning_rate": 9.108534980257165e-05, + "loss": 1.5326, + "step": 1771 + }, + { + "epoch": 0.5378661405372591, + "grad_norm": 0.6235511302947998, + "learning_rate": 9.108028753670144e-05, + "loss": 1.8906, + "step": 1772 + }, + { + "epoch": 0.5381696767339506, + "grad_norm": 0.42510315775871277, + "learning_rate": 9.107522527083124e-05, + "loss": 1.7236, + "step": 1773 + }, + { + "epoch": 0.538473212930642, + "grad_norm": 0.4418346583843231, + "learning_rate": 9.107016300496103e-05, + "loss": 1.9227, + "step": 1774 + }, + { + "epoch": 0.5387767491273334, + "grad_norm": 0.9422191977500916, + "learning_rate": 9.106510073909083e-05, + "loss": 1.7426, + "step": 1775 + }, + { + "epoch": 0.5390802853240249, + "grad_norm": 0.44353923201560974, + "learning_rate": 9.106003847322062e-05, + "loss": 2.0377, + "step": 1776 + }, + { + "epoch": 0.5393838215207164, + "grad_norm": 0.457926481962204, + "learning_rate": 9.105497620735042e-05, + "loss": 1.9583, + "step": 1777 + }, + { + "epoch": 0.5396873577174078, + "grad_norm": 0.3857896327972412, + "learning_rate": 9.104991394148021e-05, + "loss": 1.4618, + "step": 1778 + }, + { + "epoch": 0.5399908939140993, + "grad_norm": 0.4202859401702881, + "learning_rate": 9.104485167561e-05, + "loss": 1.7507, + "step": 1779 + }, + { + "epoch": 0.5402944301107907, + "grad_norm": 0.3665039837360382, + "learning_rate": 9.10397894097398e-05, + "loss": 1.8576, + "step": 1780 + }, + { + "epoch": 0.5405979663074821, + "grad_norm": 0.39893728494644165, + "learning_rate": 9.103472714386961e-05, + "loss": 1.81, + "step": 1781 + }, + { + "epoch": 0.5409015025041736, + "grad_norm": 2.199347972869873, + "learning_rate": 9.10296648779994e-05, + "loss": 1.4915, + "step": 1782 + }, + { + "epoch": 0.5412050387008651, + "grad_norm": 0.4976440966129303, + "learning_rate": 9.10246026121292e-05, + "loss": 1.6961, + "step": 1783 + }, + { + "epoch": 0.5415085748975565, + "grad_norm": 0.4084802269935608, + "learning_rate": 9.101954034625899e-05, + "loss": 1.4498, + "step": 1784 + }, + { + "epoch": 0.541812111094248, + "grad_norm": 0.37160369753837585, + "learning_rate": 9.101447808038879e-05, + "loss": 1.8383, + "step": 1785 + }, + { + "epoch": 0.5421156472909394, + "grad_norm": 0.4095883071422577, + "learning_rate": 9.100941581451858e-05, + "loss": 1.7035, + "step": 1786 + }, + { + "epoch": 0.5424191834876309, + "grad_norm": 0.3713209927082062, + "learning_rate": 9.100435354864838e-05, + "loss": 1.8924, + "step": 1787 + }, + { + "epoch": 0.5427227196843224, + "grad_norm": 0.465432733297348, + "learning_rate": 9.099929128277817e-05, + "loss": 2.1334, + "step": 1788 + }, + { + "epoch": 0.5430262558810138, + "grad_norm": 0.4591209590435028, + "learning_rate": 9.099422901690797e-05, + "loss": 1.9845, + "step": 1789 + }, + { + "epoch": 0.5433297920777053, + "grad_norm": 0.45076972246170044, + "learning_rate": 9.098916675103777e-05, + "loss": 1.7297, + "step": 1790 + }, + { + "epoch": 0.5436333282743967, + "grad_norm": 0.44921204447746277, + "learning_rate": 9.098410448516757e-05, + "loss": 1.9707, + "step": 1791 + }, + { + "epoch": 0.5439368644710881, + "grad_norm": 0.3970228135585785, + "learning_rate": 9.097904221929736e-05, + "loss": 2.024, + "step": 1792 + }, + { + "epoch": 0.5442404006677797, + "grad_norm": 0.4587130546569824, + "learning_rate": 9.097397995342716e-05, + "loss": 1.5426, + "step": 1793 + }, + { + "epoch": 0.5445439368644711, + "grad_norm": 0.4152527153491974, + "learning_rate": 9.096891768755695e-05, + "loss": 1.9575, + "step": 1794 + }, + { + "epoch": 0.5448474730611625, + "grad_norm": 0.3973013758659363, + "learning_rate": 9.096385542168675e-05, + "loss": 2.0246, + "step": 1795 + }, + { + "epoch": 0.545151009257854, + "grad_norm": 0.3950592875480652, + "learning_rate": 9.095879315581654e-05, + "loss": 1.9213, + "step": 1796 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 0.4187184274196625, + "learning_rate": 9.095373088994634e-05, + "loss": 1.8801, + "step": 1797 + }, + { + "epoch": 0.545758081651237, + "grad_norm": 0.43511682748794556, + "learning_rate": 9.094866862407613e-05, + "loss": 1.4022, + "step": 1798 + }, + { + "epoch": 0.5460616178479284, + "grad_norm": 0.40607360005378723, + "learning_rate": 9.094360635820594e-05, + "loss": 1.9416, + "step": 1799 + }, + { + "epoch": 0.5463651540446198, + "grad_norm": 0.3357563018798828, + "learning_rate": 9.093854409233574e-05, + "loss": 1.8906, + "step": 1800 + }, + { + "epoch": 0.5466686902413113, + "grad_norm": 0.3839071989059448, + "learning_rate": 9.093348182646553e-05, + "loss": 1.7918, + "step": 1801 + }, + { + "epoch": 0.5469722264380027, + "grad_norm": 0.3882817029953003, + "learning_rate": 9.092841956059533e-05, + "loss": 1.6346, + "step": 1802 + }, + { + "epoch": 0.5472757626346942, + "grad_norm": 0.4323276877403259, + "learning_rate": 9.092335729472512e-05, + "loss": 1.7346, + "step": 1803 + }, + { + "epoch": 0.5475792988313857, + "grad_norm": 0.39711809158325195, + "learning_rate": 9.091829502885492e-05, + "loss": 1.8855, + "step": 1804 + }, + { + "epoch": 0.5478828350280771, + "grad_norm": 0.4660872519016266, + "learning_rate": 9.091323276298471e-05, + "loss": 1.1871, + "step": 1805 + }, + { + "epoch": 0.5481863712247685, + "grad_norm": 0.45804888010025024, + "learning_rate": 9.09081704971145e-05, + "loss": 2.1685, + "step": 1806 + }, + { + "epoch": 0.54848990742146, + "grad_norm": 0.5922791361808777, + "learning_rate": 9.09031082312443e-05, + "loss": 1.8701, + "step": 1807 + }, + { + "epoch": 0.5487934436181514, + "grad_norm": 0.43038979172706604, + "learning_rate": 9.08980459653741e-05, + "loss": 2.1007, + "step": 1808 + }, + { + "epoch": 0.549096979814843, + "grad_norm": 0.3624688684940338, + "learning_rate": 9.08929836995039e-05, + "loss": 1.7411, + "step": 1809 + }, + { + "epoch": 0.5494005160115344, + "grad_norm": 0.40898412466049194, + "learning_rate": 9.08879214336337e-05, + "loss": 1.9381, + "step": 1810 + }, + { + "epoch": 0.5497040522082258, + "grad_norm": 0.45767003297805786, + "learning_rate": 9.088285916776349e-05, + "loss": 1.8295, + "step": 1811 + }, + { + "epoch": 0.5500075884049173, + "grad_norm": 0.41230660676956177, + "learning_rate": 9.08777969018933e-05, + "loss": 1.7066, + "step": 1812 + }, + { + "epoch": 0.5503111246016087, + "grad_norm": 0.6730133891105652, + "learning_rate": 9.08727346360231e-05, + "loss": 2.2395, + "step": 1813 + }, + { + "epoch": 0.5506146607983002, + "grad_norm": 0.39757731556892395, + "learning_rate": 9.086767237015289e-05, + "loss": 1.9659, + "step": 1814 + }, + { + "epoch": 0.5509181969949917, + "grad_norm": 0.48182088136672974, + "learning_rate": 9.086261010428269e-05, + "loss": 1.7596, + "step": 1815 + }, + { + "epoch": 0.5512217331916831, + "grad_norm": 0.4225050210952759, + "learning_rate": 9.085754783841248e-05, + "loss": 1.6582, + "step": 1816 + }, + { + "epoch": 0.5515252693883745, + "grad_norm": 0.40362295508384705, + "learning_rate": 9.085248557254227e-05, + "loss": 1.8002, + "step": 1817 + }, + { + "epoch": 0.551828805585066, + "grad_norm": 0.4283868968486786, + "learning_rate": 9.084742330667207e-05, + "loss": 2.0566, + "step": 1818 + }, + { + "epoch": 0.5521323417817575, + "grad_norm": 0.3864719569683075, + "learning_rate": 9.084236104080186e-05, + "loss": 1.7037, + "step": 1819 + }, + { + "epoch": 0.552435877978449, + "grad_norm": 0.45380616188049316, + "learning_rate": 9.083729877493167e-05, + "loss": 1.7841, + "step": 1820 + }, + { + "epoch": 0.5527394141751404, + "grad_norm": 0.42916885018348694, + "learning_rate": 9.083223650906147e-05, + "loss": 1.8914, + "step": 1821 + }, + { + "epoch": 0.5530429503718318, + "grad_norm": 0.4037598669528961, + "learning_rate": 9.082717424319126e-05, + "loss": 1.9326, + "step": 1822 + }, + { + "epoch": 0.5533464865685233, + "grad_norm": 0.37888360023498535, + "learning_rate": 9.082211197732106e-05, + "loss": 1.9225, + "step": 1823 + }, + { + "epoch": 0.5536500227652148, + "grad_norm": 0.44082072377204895, + "learning_rate": 9.081704971145085e-05, + "loss": 2.0965, + "step": 1824 + }, + { + "epoch": 0.5539535589619062, + "grad_norm": 0.40458253026008606, + "learning_rate": 9.081198744558065e-05, + "loss": 1.9156, + "step": 1825 + }, + { + "epoch": 0.5542570951585977, + "grad_norm": 0.648024320602417, + "learning_rate": 9.080692517971044e-05, + "loss": 1.6612, + "step": 1826 + }, + { + "epoch": 0.5545606313552891, + "grad_norm": 0.38878655433654785, + "learning_rate": 9.080186291384024e-05, + "loss": 1.9156, + "step": 1827 + }, + { + "epoch": 0.5548641675519805, + "grad_norm": 0.3615175187587738, + "learning_rate": 9.079680064797003e-05, + "loss": 1.9214, + "step": 1828 + }, + { + "epoch": 0.5551677037486721, + "grad_norm": 0.34867003560066223, + "learning_rate": 9.079173838209984e-05, + "loss": 1.906, + "step": 1829 + }, + { + "epoch": 0.5554712399453635, + "grad_norm": 0.6473682522773743, + "learning_rate": 9.078667611622963e-05, + "loss": 1.6777, + "step": 1830 + }, + { + "epoch": 0.555774776142055, + "grad_norm": 0.4099821150302887, + "learning_rate": 9.078161385035943e-05, + "loss": 2.1986, + "step": 1831 + }, + { + "epoch": 0.5560783123387464, + "grad_norm": 0.3992425799369812, + "learning_rate": 9.077655158448922e-05, + "loss": 2.0874, + "step": 1832 + }, + { + "epoch": 0.5563818485354378, + "grad_norm": 0.3562420904636383, + "learning_rate": 9.077148931861902e-05, + "loss": 1.8174, + "step": 1833 + }, + { + "epoch": 0.5566853847321293, + "grad_norm": 0.45232492685317993, + "learning_rate": 9.076642705274881e-05, + "loss": 2.0863, + "step": 1834 + }, + { + "epoch": 0.5569889209288208, + "grad_norm": 0.39387455582618713, + "learning_rate": 9.076136478687861e-05, + "loss": 1.8224, + "step": 1835 + }, + { + "epoch": 0.5572924571255122, + "grad_norm": 0.35372141003608704, + "learning_rate": 9.07563025210084e-05, + "loss": 1.5546, + "step": 1836 + }, + { + "epoch": 0.5575959933222037, + "grad_norm": 0.4068455100059509, + "learning_rate": 9.07512402551382e-05, + "loss": 1.6266, + "step": 1837 + }, + { + "epoch": 0.5578995295188951, + "grad_norm": 0.39574089646339417, + "learning_rate": 9.0746177989268e-05, + "loss": 2.0823, + "step": 1838 + }, + { + "epoch": 0.5582030657155865, + "grad_norm": 1.1845453977584839, + "learning_rate": 9.07411157233978e-05, + "loss": 1.6966, + "step": 1839 + }, + { + "epoch": 0.5585066019122781, + "grad_norm": 0.39268460869789124, + "learning_rate": 9.07360534575276e-05, + "loss": 1.1071, + "step": 1840 + }, + { + "epoch": 0.5588101381089695, + "grad_norm": 0.4749743640422821, + "learning_rate": 9.073099119165739e-05, + "loss": 1.9787, + "step": 1841 + }, + { + "epoch": 0.559113674305661, + "grad_norm": 0.4099438786506653, + "learning_rate": 9.072592892578719e-05, + "loss": 1.952, + "step": 1842 + }, + { + "epoch": 0.5594172105023524, + "grad_norm": 0.4282529354095459, + "learning_rate": 9.072086665991698e-05, + "loss": 1.9985, + "step": 1843 + }, + { + "epoch": 0.5597207466990438, + "grad_norm": 0.41518470644950867, + "learning_rate": 9.071580439404678e-05, + "loss": 1.6999, + "step": 1844 + }, + { + "epoch": 0.5600242828957354, + "grad_norm": 0.4059050381183624, + "learning_rate": 9.071074212817657e-05, + "loss": 1.9737, + "step": 1845 + }, + { + "epoch": 0.5603278190924268, + "grad_norm": 0.3274436295032501, + "learning_rate": 9.070567986230636e-05, + "loss": 1.1522, + "step": 1846 + }, + { + "epoch": 0.5606313552891182, + "grad_norm": 0.4117715656757355, + "learning_rate": 9.070061759643616e-05, + "loss": 2.2128, + "step": 1847 + }, + { + "epoch": 0.5609348914858097, + "grad_norm": 1.530457854270935, + "learning_rate": 9.069555533056597e-05, + "loss": 2.2275, + "step": 1848 + }, + { + "epoch": 0.5612384276825011, + "grad_norm": 1.6292579174041748, + "learning_rate": 9.069049306469576e-05, + "loss": 1.6642, + "step": 1849 + }, + { + "epoch": 0.5615419638791926, + "grad_norm": 0.4147336184978485, + "learning_rate": 9.068543079882556e-05, + "loss": 1.7811, + "step": 1850 + }, + { + "epoch": 0.5618455000758841, + "grad_norm": 0.4253292679786682, + "learning_rate": 9.068036853295535e-05, + "loss": 2.1084, + "step": 1851 + }, + { + "epoch": 0.5621490362725755, + "grad_norm": 0.3340885043144226, + "learning_rate": 9.067530626708515e-05, + "loss": 1.0015, + "step": 1852 + }, + { + "epoch": 0.5624525724692669, + "grad_norm": 0.34140780568122864, + "learning_rate": 9.067024400121494e-05, + "loss": 1.9773, + "step": 1853 + }, + { + "epoch": 0.5627561086659584, + "grad_norm": 0.48916199803352356, + "learning_rate": 9.066518173534474e-05, + "loss": 1.7743, + "step": 1854 + }, + { + "epoch": 0.5630596448626499, + "grad_norm": 0.43407005071640015, + "learning_rate": 9.066011946947453e-05, + "loss": 1.8134, + "step": 1855 + }, + { + "epoch": 0.5633631810593414, + "grad_norm": 1.257241129875183, + "learning_rate": 9.065505720360433e-05, + "loss": 1.9903, + "step": 1856 + }, + { + "epoch": 0.5636667172560328, + "grad_norm": 0.4004335105419159, + "learning_rate": 9.064999493773413e-05, + "loss": 1.9988, + "step": 1857 + }, + { + "epoch": 0.5639702534527242, + "grad_norm": 0.41307345032691956, + "learning_rate": 9.064493267186393e-05, + "loss": 1.9789, + "step": 1858 + }, + { + "epoch": 0.5642737896494157, + "grad_norm": 0.41875752806663513, + "learning_rate": 9.063987040599374e-05, + "loss": 1.8535, + "step": 1859 + }, + { + "epoch": 0.5645773258461072, + "grad_norm": 0.4912898540496826, + "learning_rate": 9.063480814012353e-05, + "loss": 2.3529, + "step": 1860 + }, + { + "epoch": 0.5648808620427986, + "grad_norm": 0.4265078604221344, + "learning_rate": 9.062974587425333e-05, + "loss": 2.0698, + "step": 1861 + }, + { + "epoch": 0.5651843982394901, + "grad_norm": 0.3786260187625885, + "learning_rate": 9.062468360838312e-05, + "loss": 1.818, + "step": 1862 + }, + { + "epoch": 0.5654879344361815, + "grad_norm": 0.3665534257888794, + "learning_rate": 9.061962134251292e-05, + "loss": 1.9464, + "step": 1863 + }, + { + "epoch": 0.5657914706328729, + "grad_norm": 0.4516305923461914, + "learning_rate": 9.061455907664271e-05, + "loss": 1.7718, + "step": 1864 + }, + { + "epoch": 0.5660950068295644, + "grad_norm": 1.0637644529342651, + "learning_rate": 9.06094968107725e-05, + "loss": 1.8881, + "step": 1865 + }, + { + "epoch": 0.5663985430262559, + "grad_norm": 0.41039812564849854, + "learning_rate": 9.06044345449023e-05, + "loss": 2.0346, + "step": 1866 + }, + { + "epoch": 0.5667020792229474, + "grad_norm": 0.40830013155937195, + "learning_rate": 9.05993722790321e-05, + "loss": 1.6864, + "step": 1867 + }, + { + "epoch": 0.5670056154196388, + "grad_norm": 0.37757718563079834, + "learning_rate": 9.05943100131619e-05, + "loss": 1.7925, + "step": 1868 + }, + { + "epoch": 0.5673091516163302, + "grad_norm": 0.45366227626800537, + "learning_rate": 9.05892477472917e-05, + "loss": 1.7869, + "step": 1869 + }, + { + "epoch": 0.5676126878130217, + "grad_norm": 0.4220414459705353, + "learning_rate": 9.05841854814215e-05, + "loss": 1.932, + "step": 1870 + }, + { + "epoch": 0.5679162240097132, + "grad_norm": 0.4413476884365082, + "learning_rate": 9.057912321555129e-05, + "loss": 1.5842, + "step": 1871 + }, + { + "epoch": 0.5682197602064046, + "grad_norm": 0.40240782499313354, + "learning_rate": 9.057406094968108e-05, + "loss": 1.9477, + "step": 1872 + }, + { + "epoch": 0.5685232964030961, + "grad_norm": 0.4332951605319977, + "learning_rate": 9.056899868381088e-05, + "loss": 1.8664, + "step": 1873 + }, + { + "epoch": 0.5688268325997875, + "grad_norm": 0.3841226398944855, + "learning_rate": 9.056393641794067e-05, + "loss": 2.3058, + "step": 1874 + }, + { + "epoch": 0.5691303687964789, + "grad_norm": 0.3937263488769531, + "learning_rate": 9.055887415207047e-05, + "loss": 1.683, + "step": 1875 + }, + { + "epoch": 0.5694339049931705, + "grad_norm": 0.33709925413131714, + "learning_rate": 9.055381188620026e-05, + "loss": 1.8616, + "step": 1876 + }, + { + "epoch": 0.5697374411898619, + "grad_norm": 0.3934507668018341, + "learning_rate": 9.054874962033007e-05, + "loss": 1.6839, + "step": 1877 + }, + { + "epoch": 0.5700409773865533, + "grad_norm": 0.4386683702468872, + "learning_rate": 9.054368735445987e-05, + "loss": 1.8068, + "step": 1878 + }, + { + "epoch": 0.5703445135832448, + "grad_norm": 0.4416390657424927, + "learning_rate": 9.053862508858966e-05, + "loss": 1.6858, + "step": 1879 + }, + { + "epoch": 0.5706480497799362, + "grad_norm": 0.4287014901638031, + "learning_rate": 9.053356282271946e-05, + "loss": 1.8889, + "step": 1880 + }, + { + "epoch": 0.5709515859766278, + "grad_norm": 0.4297000765800476, + "learning_rate": 9.052850055684925e-05, + "loss": 1.8281, + "step": 1881 + }, + { + "epoch": 0.5712551221733192, + "grad_norm": 0.48270586133003235, + "learning_rate": 9.052343829097905e-05, + "loss": 1.6692, + "step": 1882 + }, + { + "epoch": 0.5715586583700106, + "grad_norm": 0.44133251905441284, + "learning_rate": 9.051837602510884e-05, + "loss": 1.8359, + "step": 1883 + }, + { + "epoch": 0.5718621945667021, + "grad_norm": 0.5127750039100647, + "learning_rate": 9.051331375923863e-05, + "loss": 1.9437, + "step": 1884 + }, + { + "epoch": 0.5721657307633935, + "grad_norm": 0.4890953600406647, + "learning_rate": 9.050825149336843e-05, + "loss": 1.6396, + "step": 1885 + }, + { + "epoch": 0.572469266960085, + "grad_norm": 0.36201316118240356, + "learning_rate": 9.050318922749822e-05, + "loss": 1.6985, + "step": 1886 + }, + { + "epoch": 0.5727728031567765, + "grad_norm": 0.3880859911441803, + "learning_rate": 9.049812696162803e-05, + "loss": 1.7916, + "step": 1887 + }, + { + "epoch": 0.5730763393534679, + "grad_norm": 0.500619649887085, + "learning_rate": 9.049306469575783e-05, + "loss": 1.826, + "step": 1888 + }, + { + "epoch": 0.5733798755501593, + "grad_norm": 0.764751672744751, + "learning_rate": 9.048800242988762e-05, + "loss": 1.5406, + "step": 1889 + }, + { + "epoch": 0.5736834117468508, + "grad_norm": 0.4573342502117157, + "learning_rate": 9.048294016401742e-05, + "loss": 1.6461, + "step": 1890 + }, + { + "epoch": 0.5739869479435422, + "grad_norm": 0.5972601175308228, + "learning_rate": 9.047787789814721e-05, + "loss": 2.3081, + "step": 1891 + }, + { + "epoch": 0.5742904841402338, + "grad_norm": 0.4419214129447937, + "learning_rate": 9.0472815632277e-05, + "loss": 1.8907, + "step": 1892 + }, + { + "epoch": 0.5745940203369252, + "grad_norm": 0.3364506959915161, + "learning_rate": 9.04677533664068e-05, + "loss": 1.598, + "step": 1893 + }, + { + "epoch": 0.5748975565336166, + "grad_norm": 0.41443008184432983, + "learning_rate": 9.04626911005366e-05, + "loss": 1.91, + "step": 1894 + }, + { + "epoch": 0.5752010927303081, + "grad_norm": 0.3931877315044403, + "learning_rate": 9.045762883466639e-05, + "loss": 2.0265, + "step": 1895 + }, + { + "epoch": 0.5755046289269995, + "grad_norm": 0.3768281042575836, + "learning_rate": 9.04525665687962e-05, + "loss": 2.0458, + "step": 1896 + }, + { + "epoch": 0.575808165123691, + "grad_norm": 0.726582407951355, + "learning_rate": 9.0447504302926e-05, + "loss": 1.5708, + "step": 1897 + }, + { + "epoch": 0.5761117013203825, + "grad_norm": 0.4031538665294647, + "learning_rate": 9.044244203705579e-05, + "loss": 1.6248, + "step": 1898 + }, + { + "epoch": 0.5764152375170739, + "grad_norm": 0.3605407476425171, + "learning_rate": 9.043737977118558e-05, + "loss": 1.9504, + "step": 1899 + }, + { + "epoch": 0.5767187737137653, + "grad_norm": 0.3802354633808136, + "learning_rate": 9.043231750531538e-05, + "loss": 2.0487, + "step": 1900 + }, + { + "epoch": 0.5770223099104568, + "grad_norm": 0.41240641474723816, + "learning_rate": 9.042725523944519e-05, + "loss": 1.9202, + "step": 1901 + }, + { + "epoch": 0.5773258461071483, + "grad_norm": 0.36771708726882935, + "learning_rate": 9.042219297357498e-05, + "loss": 2.2273, + "step": 1902 + }, + { + "epoch": 0.5776293823038398, + "grad_norm": 0.4182611405849457, + "learning_rate": 9.041713070770478e-05, + "loss": 1.8689, + "step": 1903 + }, + { + "epoch": 0.5779329185005312, + "grad_norm": 0.39633724093437195, + "learning_rate": 9.041206844183457e-05, + "loss": 1.9744, + "step": 1904 + }, + { + "epoch": 0.5782364546972226, + "grad_norm": 0.3978392481803894, + "learning_rate": 9.040700617596437e-05, + "loss": 1.6662, + "step": 1905 + }, + { + "epoch": 0.5785399908939141, + "grad_norm": 0.3734360635280609, + "learning_rate": 9.040194391009416e-05, + "loss": 1.0948, + "step": 1906 + }, + { + "epoch": 0.5788435270906056, + "grad_norm": 0.403392493724823, + "learning_rate": 9.039688164422397e-05, + "loss": 2.0251, + "step": 1907 + }, + { + "epoch": 0.579147063287297, + "grad_norm": 0.350067138671875, + "learning_rate": 9.039181937835376e-05, + "loss": 1.543, + "step": 1908 + }, + { + "epoch": 0.5794505994839885, + "grad_norm": 0.4273326098918915, + "learning_rate": 9.038675711248356e-05, + "loss": 1.8694, + "step": 1909 + }, + { + "epoch": 0.5797541356806799, + "grad_norm": 0.4815780222415924, + "learning_rate": 9.038169484661335e-05, + "loss": 1.9565, + "step": 1910 + }, + { + "epoch": 0.5800576718773713, + "grad_norm": 0.5379179120063782, + "learning_rate": 9.037663258074315e-05, + "loss": 1.9631, + "step": 1911 + }, + { + "epoch": 0.5803612080740629, + "grad_norm": 0.47738704085350037, + "learning_rate": 9.037157031487294e-05, + "loss": 1.878, + "step": 1912 + }, + { + "epoch": 0.5806647442707543, + "grad_norm": 0.426543653011322, + "learning_rate": 9.036650804900274e-05, + "loss": 2.0392, + "step": 1913 + }, + { + "epoch": 0.5809682804674458, + "grad_norm": 0.38239404559135437, + "learning_rate": 9.036144578313253e-05, + "loss": 1.9695, + "step": 1914 + }, + { + "epoch": 0.5812718166641372, + "grad_norm": 0.40093934535980225, + "learning_rate": 9.035638351726233e-05, + "loss": 1.9695, + "step": 1915 + }, + { + "epoch": 0.5815753528608286, + "grad_norm": 0.3865903317928314, + "learning_rate": 9.035132125139214e-05, + "loss": 1.7925, + "step": 1916 + }, + { + "epoch": 0.58187888905752, + "grad_norm": 0.6183242201805115, + "learning_rate": 9.034625898552193e-05, + "loss": 1.853, + "step": 1917 + }, + { + "epoch": 0.5821824252542116, + "grad_norm": 0.4869506061077118, + "learning_rate": 9.034119671965173e-05, + "loss": 1.9418, + "step": 1918 + }, + { + "epoch": 0.582485961450903, + "grad_norm": 0.40212881565093994, + "learning_rate": 9.033613445378152e-05, + "loss": 2.0259, + "step": 1919 + }, + { + "epoch": 0.5827894976475945, + "grad_norm": 0.7224326729774475, + "learning_rate": 9.033107218791131e-05, + "loss": 1.933, + "step": 1920 + }, + { + "epoch": 0.5830930338442859, + "grad_norm": 0.4369768500328064, + "learning_rate": 9.032600992204111e-05, + "loss": 1.7931, + "step": 1921 + }, + { + "epoch": 0.5833965700409773, + "grad_norm": 0.3920018672943115, + "learning_rate": 9.03209476561709e-05, + "loss": 1.913, + "step": 1922 + }, + { + "epoch": 0.5837001062376689, + "grad_norm": 0.5076978206634521, + "learning_rate": 9.03158853903007e-05, + "loss": 2.0081, + "step": 1923 + }, + { + "epoch": 0.5840036424343603, + "grad_norm": 0.38379955291748047, + "learning_rate": 9.03108231244305e-05, + "loss": 2.0153, + "step": 1924 + }, + { + "epoch": 0.5843071786310517, + "grad_norm": 0.4367254376411438, + "learning_rate": 9.030576085856029e-05, + "loss": 2.1004, + "step": 1925 + }, + { + "epoch": 0.5846107148277432, + "grad_norm": 0.37425291538238525, + "learning_rate": 9.03006985926901e-05, + "loss": 1.6777, + "step": 1926 + }, + { + "epoch": 0.5849142510244346, + "grad_norm": 0.37925392389297485, + "learning_rate": 9.029563632681989e-05, + "loss": 2.1164, + "step": 1927 + }, + { + "epoch": 0.5852177872211262, + "grad_norm": 0.41369903087615967, + "learning_rate": 9.029057406094969e-05, + "loss": 1.7252, + "step": 1928 + }, + { + "epoch": 0.5855213234178176, + "grad_norm": 0.3528081476688385, + "learning_rate": 9.028551179507948e-05, + "loss": 1.9181, + "step": 1929 + }, + { + "epoch": 0.585824859614509, + "grad_norm": 0.38274556398391724, + "learning_rate": 9.028044952920928e-05, + "loss": 1.8585, + "step": 1930 + }, + { + "epoch": 0.5861283958112005, + "grad_norm": 0.4036407768726349, + "learning_rate": 9.027538726333907e-05, + "loss": 1.9679, + "step": 1931 + }, + { + "epoch": 0.5864319320078919, + "grad_norm": 0.34841248393058777, + "learning_rate": 9.027032499746887e-05, + "loss": 1.8571, + "step": 1932 + }, + { + "epoch": 0.5867354682045834, + "grad_norm": 0.3821954131126404, + "learning_rate": 9.026526273159866e-05, + "loss": 1.8175, + "step": 1933 + }, + { + "epoch": 0.5870390044012749, + "grad_norm": 0.3724253475666046, + "learning_rate": 9.026020046572846e-05, + "loss": 1.9549, + "step": 1934 + }, + { + "epoch": 0.5873425405979663, + "grad_norm": 0.40494081377983093, + "learning_rate": 9.025513819985826e-05, + "loss": 2.0013, + "step": 1935 + }, + { + "epoch": 0.5876460767946577, + "grad_norm": 0.7746275663375854, + "learning_rate": 9.025007593398806e-05, + "loss": 1.6739, + "step": 1936 + }, + { + "epoch": 0.5879496129913492, + "grad_norm": 0.34239932894706726, + "learning_rate": 9.024501366811785e-05, + "loss": 1.1999, + "step": 1937 + }, + { + "epoch": 0.5882531491880407, + "grad_norm": 0.40239185094833374, + "learning_rate": 9.023995140224765e-05, + "loss": 1.956, + "step": 1938 + }, + { + "epoch": 0.5885566853847322, + "grad_norm": 0.4756642282009125, + "learning_rate": 9.023488913637744e-05, + "loss": 1.5705, + "step": 1939 + }, + { + "epoch": 0.5888602215814236, + "grad_norm": 0.412263959646225, + "learning_rate": 9.022982687050724e-05, + "loss": 1.8187, + "step": 1940 + }, + { + "epoch": 0.589163757778115, + "grad_norm": 0.4178502857685089, + "learning_rate": 9.022476460463703e-05, + "loss": 1.8955, + "step": 1941 + }, + { + "epoch": 0.5894672939748065, + "grad_norm": 0.4619811475276947, + "learning_rate": 9.021970233876683e-05, + "loss": 1.9968, + "step": 1942 + }, + { + "epoch": 0.589770830171498, + "grad_norm": 0.42839181423187256, + "learning_rate": 9.021464007289662e-05, + "loss": 1.5708, + "step": 1943 + }, + { + "epoch": 0.5900743663681894, + "grad_norm": 0.4423038363456726, + "learning_rate": 9.020957780702643e-05, + "loss": 1.9684, + "step": 1944 + }, + { + "epoch": 0.5903779025648809, + "grad_norm": 0.3898191452026367, + "learning_rate": 9.020451554115623e-05, + "loss": 1.9497, + "step": 1945 + }, + { + "epoch": 0.5906814387615723, + "grad_norm": 0.701366662979126, + "learning_rate": 9.019945327528603e-05, + "loss": 1.7624, + "step": 1946 + }, + { + "epoch": 0.5909849749582637, + "grad_norm": 0.32581913471221924, + "learning_rate": 9.019439100941583e-05, + "loss": 2.1484, + "step": 1947 + }, + { + "epoch": 0.5912885111549552, + "grad_norm": 0.4372369050979614, + "learning_rate": 9.018932874354562e-05, + "loss": 1.6075, + "step": 1948 + }, + { + "epoch": 0.5915920473516467, + "grad_norm": 0.39428946375846863, + "learning_rate": 9.018426647767542e-05, + "loss": 1.6004, + "step": 1949 + }, + { + "epoch": 0.5918955835483382, + "grad_norm": 0.3934183120727539, + "learning_rate": 9.017920421180521e-05, + "loss": 1.9358, + "step": 1950 + }, + { + "epoch": 0.5921991197450296, + "grad_norm": 0.42696380615234375, + "learning_rate": 9.017414194593501e-05, + "loss": 1.9646, + "step": 1951 + }, + { + "epoch": 0.592502655941721, + "grad_norm": 0.38243913650512695, + "learning_rate": 9.01690796800648e-05, + "loss": 1.9946, + "step": 1952 + }, + { + "epoch": 0.5928061921384125, + "grad_norm": 0.4068431556224823, + "learning_rate": 9.01640174141946e-05, + "loss": 2.0786, + "step": 1953 + }, + { + "epoch": 0.593109728335104, + "grad_norm": 0.44560736417770386, + "learning_rate": 9.015895514832439e-05, + "loss": 1.7514, + "step": 1954 + }, + { + "epoch": 0.5934132645317954, + "grad_norm": 0.4143114686012268, + "learning_rate": 9.01538928824542e-05, + "loss": 2.083, + "step": 1955 + }, + { + "epoch": 0.5937168007284869, + "grad_norm": 0.45947229862213135, + "learning_rate": 9.0148830616584e-05, + "loss": 1.9313, + "step": 1956 + }, + { + "epoch": 0.5940203369251783, + "grad_norm": 2.7487032413482666, + "learning_rate": 9.014376835071379e-05, + "loss": 1.7648, + "step": 1957 + }, + { + "epoch": 0.5943238731218697, + "grad_norm": 0.3856576979160309, + "learning_rate": 9.013870608484358e-05, + "loss": 2.1141, + "step": 1958 + }, + { + "epoch": 0.5946274093185613, + "grad_norm": 0.3741602897644043, + "learning_rate": 9.013364381897338e-05, + "loss": 1.6458, + "step": 1959 + }, + { + "epoch": 0.5949309455152527, + "grad_norm": 0.3791872262954712, + "learning_rate": 9.012858155310317e-05, + "loss": 1.433, + "step": 1960 + }, + { + "epoch": 0.5952344817119442, + "grad_norm": 0.32848575711250305, + "learning_rate": 9.012351928723297e-05, + "loss": 1.6748, + "step": 1961 + }, + { + "epoch": 0.5955380179086356, + "grad_norm": 0.4328818917274475, + "learning_rate": 9.011845702136276e-05, + "loss": 1.8309, + "step": 1962 + }, + { + "epoch": 0.595841554105327, + "grad_norm": 0.40931710600852966, + "learning_rate": 9.011339475549256e-05, + "loss": 2.0837, + "step": 1963 + }, + { + "epoch": 0.5961450903020186, + "grad_norm": 0.3625456690788269, + "learning_rate": 9.010833248962235e-05, + "loss": 1.8895, + "step": 1964 + }, + { + "epoch": 0.59644862649871, + "grad_norm": 0.33840253949165344, + "learning_rate": 9.010327022375216e-05, + "loss": 1.8706, + "step": 1965 + }, + { + "epoch": 0.5967521626954014, + "grad_norm": 0.38374340534210205, + "learning_rate": 9.009820795788196e-05, + "loss": 1.8782, + "step": 1966 + }, + { + "epoch": 0.5970556988920929, + "grad_norm": 0.41515031456947327, + "learning_rate": 9.009314569201175e-05, + "loss": 1.7455, + "step": 1967 + }, + { + "epoch": 0.5973592350887843, + "grad_norm": 0.35676872730255127, + "learning_rate": 9.008808342614155e-05, + "loss": 1.7706, + "step": 1968 + }, + { + "epoch": 0.5976627712854758, + "grad_norm": 0.4770854711532593, + "learning_rate": 9.008302116027134e-05, + "loss": 2.0954, + "step": 1969 + }, + { + "epoch": 0.5979663074821673, + "grad_norm": 0.3612794876098633, + "learning_rate": 9.007795889440114e-05, + "loss": 2.1938, + "step": 1970 + }, + { + "epoch": 0.5982698436788587, + "grad_norm": 0.5067920684814453, + "learning_rate": 9.007289662853093e-05, + "loss": 1.2096, + "step": 1971 + }, + { + "epoch": 0.5985733798755501, + "grad_norm": 0.4193328022956848, + "learning_rate": 9.006783436266073e-05, + "loss": 1.6632, + "step": 1972 + }, + { + "epoch": 0.5988769160722416, + "grad_norm": 0.41445595026016235, + "learning_rate": 9.006277209679052e-05, + "loss": 2.0237, + "step": 1973 + }, + { + "epoch": 0.599180452268933, + "grad_norm": 0.4083717167377472, + "learning_rate": 9.005770983092033e-05, + "loss": 2.1022, + "step": 1974 + }, + { + "epoch": 0.5994839884656246, + "grad_norm": 0.4897996485233307, + "learning_rate": 9.005264756505012e-05, + "loss": 1.6074, + "step": 1975 + }, + { + "epoch": 0.599787524662316, + "grad_norm": 0.46923205256462097, + "learning_rate": 9.004758529917992e-05, + "loss": 2.0915, + "step": 1976 + }, + { + "epoch": 0.6000910608590074, + "grad_norm": 0.37507691979408264, + "learning_rate": 9.004252303330971e-05, + "loss": 1.6793, + "step": 1977 + }, + { + "epoch": 0.6003945970556989, + "grad_norm": 0.3973737061023712, + "learning_rate": 9.003746076743951e-05, + "loss": 2.0935, + "step": 1978 + }, + { + "epoch": 0.6006981332523903, + "grad_norm": 0.40313783288002014, + "learning_rate": 9.00323985015693e-05, + "loss": 1.7405, + "step": 1979 + }, + { + "epoch": 0.6010016694490818, + "grad_norm": 0.36169835925102234, + "learning_rate": 9.00273362356991e-05, + "loss": 2.1843, + "step": 1980 + }, + { + "epoch": 0.6013052056457733, + "grad_norm": 0.41355371475219727, + "learning_rate": 9.002227396982889e-05, + "loss": 2.0969, + "step": 1981 + }, + { + "epoch": 0.6016087418424647, + "grad_norm": 0.42378634214401245, + "learning_rate": 9.001721170395869e-05, + "loss": 1.9652, + "step": 1982 + }, + { + "epoch": 0.6019122780391561, + "grad_norm": 0.42945531010627747, + "learning_rate": 9.00121494380885e-05, + "loss": 2.0625, + "step": 1983 + }, + { + "epoch": 0.6022158142358476, + "grad_norm": 0.5348070859909058, + "learning_rate": 9.000708717221829e-05, + "loss": 1.3785, + "step": 1984 + }, + { + "epoch": 0.6025193504325391, + "grad_norm": 0.35933446884155273, + "learning_rate": 9.000202490634809e-05, + "loss": 1.5254, + "step": 1985 + }, + { + "epoch": 0.6028228866292306, + "grad_norm": 0.42495015263557434, + "learning_rate": 8.999696264047788e-05, + "loss": 2.1669, + "step": 1986 + }, + { + "epoch": 0.603126422825922, + "grad_norm": 0.43792733550071716, + "learning_rate": 8.999190037460767e-05, + "loss": 1.8057, + "step": 1987 + }, + { + "epoch": 0.6034299590226134, + "grad_norm": 0.39334404468536377, + "learning_rate": 8.998683810873747e-05, + "loss": 1.8157, + "step": 1988 + }, + { + "epoch": 0.6037334952193049, + "grad_norm": 0.38974860310554504, + "learning_rate": 8.998177584286726e-05, + "loss": 1.8293, + "step": 1989 + }, + { + "epoch": 0.6040370314159964, + "grad_norm": 0.44241687655448914, + "learning_rate": 8.997671357699707e-05, + "loss": 2.2078, + "step": 1990 + }, + { + "epoch": 0.6043405676126878, + "grad_norm": 0.40700820088386536, + "learning_rate": 8.997165131112687e-05, + "loss": 1.9656, + "step": 1991 + }, + { + "epoch": 0.6046441038093793, + "grad_norm": 0.3992595076560974, + "learning_rate": 8.996658904525666e-05, + "loss": 1.9423, + "step": 1992 + }, + { + "epoch": 0.6049476400060707, + "grad_norm": 0.3922860622406006, + "learning_rate": 8.996152677938646e-05, + "loss": 2.1253, + "step": 1993 + }, + { + "epoch": 0.6052511762027621, + "grad_norm": 0.3843866288661957, + "learning_rate": 8.995646451351627e-05, + "loss": 2.0756, + "step": 1994 + }, + { + "epoch": 0.6055547123994537, + "grad_norm": 0.3822995722293854, + "learning_rate": 8.995140224764606e-05, + "loss": 1.9475, + "step": 1995 + }, + { + "epoch": 0.6058582485961451, + "grad_norm": 0.4001995325088501, + "learning_rate": 8.994633998177585e-05, + "loss": 1.9781, + "step": 1996 + }, + { + "epoch": 0.6061617847928366, + "grad_norm": 0.3775820732116699, + "learning_rate": 8.994127771590565e-05, + "loss": 1.7857, + "step": 1997 + }, + { + "epoch": 0.606465320989528, + "grad_norm": 0.4260796308517456, + "learning_rate": 8.993621545003544e-05, + "loss": 1.6416, + "step": 1998 + }, + { + "epoch": 0.6067688571862194, + "grad_norm": 0.39824166893959045, + "learning_rate": 8.993115318416524e-05, + "loss": 1.7657, + "step": 1999 + }, + { + "epoch": 0.6070723933829109, + "grad_norm": 0.46430447697639465, + "learning_rate": 8.992609091829503e-05, + "loss": 1.8802, + "step": 2000 + }, + { + "epoch": 0.6073759295796024, + "grad_norm": 0.4773789048194885, + "learning_rate": 8.992102865242483e-05, + "loss": 1.8786, + "step": 2001 + }, + { + "epoch": 0.6076794657762938, + "grad_norm": 0.4296311140060425, + "learning_rate": 8.991596638655462e-05, + "loss": 1.9457, + "step": 2002 + }, + { + "epoch": 0.6079830019729853, + "grad_norm": 0.41193845868110657, + "learning_rate": 8.991090412068442e-05, + "loss": 1.5856, + "step": 2003 + }, + { + "epoch": 0.6082865381696767, + "grad_norm": 0.43040478229522705, + "learning_rate": 8.990584185481423e-05, + "loss": 2.1432, + "step": 2004 + }, + { + "epoch": 0.6085900743663681, + "grad_norm": 0.5215789079666138, + "learning_rate": 8.990077958894402e-05, + "loss": 1.9584, + "step": 2005 + }, + { + "epoch": 0.6088936105630597, + "grad_norm": 0.4370077848434448, + "learning_rate": 8.989571732307382e-05, + "loss": 1.5612, + "step": 2006 + }, + { + "epoch": 0.6091971467597511, + "grad_norm": 0.4200492203235626, + "learning_rate": 8.989065505720361e-05, + "loss": 1.9493, + "step": 2007 + }, + { + "epoch": 0.6095006829564426, + "grad_norm": 0.39453452825546265, + "learning_rate": 8.98855927913334e-05, + "loss": 1.9529, + "step": 2008 + }, + { + "epoch": 0.609804219153134, + "grad_norm": 0.4478731155395508, + "learning_rate": 8.98805305254632e-05, + "loss": 2.0098, + "step": 2009 + }, + { + "epoch": 0.6101077553498254, + "grad_norm": 0.39515209197998047, + "learning_rate": 8.9875468259593e-05, + "loss": 1.958, + "step": 2010 + }, + { + "epoch": 0.610411291546517, + "grad_norm": 0.3660414516925812, + "learning_rate": 8.987040599372279e-05, + "loss": 1.9538, + "step": 2011 + }, + { + "epoch": 0.6107148277432084, + "grad_norm": 0.3517032861709595, + "learning_rate": 8.986534372785259e-05, + "loss": 1.8833, + "step": 2012 + }, + { + "epoch": 0.6110183639398998, + "grad_norm": 0.6502123475074768, + "learning_rate": 8.98602814619824e-05, + "loss": 2.1106, + "step": 2013 + }, + { + "epoch": 0.6113219001365913, + "grad_norm": 0.4674864709377289, + "learning_rate": 8.985521919611219e-05, + "loss": 1.8986, + "step": 2014 + }, + { + "epoch": 0.6116254363332827, + "grad_norm": 0.4143102467060089, + "learning_rate": 8.985015693024198e-05, + "loss": 1.7635, + "step": 2015 + }, + { + "epoch": 0.6119289725299742, + "grad_norm": 0.4329308867454529, + "learning_rate": 8.984509466437178e-05, + "loss": 1.648, + "step": 2016 + }, + { + "epoch": 0.6122325087266657, + "grad_norm": 0.34939324855804443, + "learning_rate": 8.984003239850157e-05, + "loss": 1.7641, + "step": 2017 + }, + { + "epoch": 0.6125360449233571, + "grad_norm": 0.4234546720981598, + "learning_rate": 8.983497013263137e-05, + "loss": 1.8691, + "step": 2018 + }, + { + "epoch": 0.6128395811200485, + "grad_norm": 0.7465669512748718, + "learning_rate": 8.982990786676116e-05, + "loss": 2.0573, + "step": 2019 + }, + { + "epoch": 0.61314311731674, + "grad_norm": 0.36259400844573975, + "learning_rate": 8.982484560089096e-05, + "loss": 2.0654, + "step": 2020 + }, + { + "epoch": 0.6134466535134315, + "grad_norm": 0.3918156623840332, + "learning_rate": 8.981978333502075e-05, + "loss": 2.1658, + "step": 2021 + }, + { + "epoch": 0.613750189710123, + "grad_norm": 0.3924868404865265, + "learning_rate": 8.981472106915056e-05, + "loss": 1.9306, + "step": 2022 + }, + { + "epoch": 0.6140537259068144, + "grad_norm": 0.7729107141494751, + "learning_rate": 8.980965880328035e-05, + "loss": 1.5911, + "step": 2023 + }, + { + "epoch": 0.6143572621035058, + "grad_norm": 0.4199913442134857, + "learning_rate": 8.980459653741015e-05, + "loss": 1.9833, + "step": 2024 + }, + { + "epoch": 0.6146607983001973, + "grad_norm": 0.40258511900901794, + "learning_rate": 8.979953427153994e-05, + "loss": 1.9178, + "step": 2025 + }, + { + "epoch": 0.6149643344968888, + "grad_norm": 0.3859613239765167, + "learning_rate": 8.979447200566974e-05, + "loss": 1.7585, + "step": 2026 + }, + { + "epoch": 0.6152678706935802, + "grad_norm": 0.42048898339271545, + "learning_rate": 8.978940973979953e-05, + "loss": 1.953, + "step": 2027 + }, + { + "epoch": 0.6155714068902717, + "grad_norm": 0.39669451117515564, + "learning_rate": 8.978434747392933e-05, + "loss": 1.6132, + "step": 2028 + }, + { + "epoch": 0.6158749430869631, + "grad_norm": 0.6679760217666626, + "learning_rate": 8.977928520805912e-05, + "loss": 1.9793, + "step": 2029 + }, + { + "epoch": 0.6161784792836545, + "grad_norm": 0.4262414276599884, + "learning_rate": 8.977422294218892e-05, + "loss": 1.8002, + "step": 2030 + }, + { + "epoch": 0.616482015480346, + "grad_norm": 0.3899317681789398, + "learning_rate": 8.976916067631871e-05, + "loss": 2.0585, + "step": 2031 + }, + { + "epoch": 0.6167855516770375, + "grad_norm": 0.5402538776397705, + "learning_rate": 8.976409841044852e-05, + "loss": 1.6196, + "step": 2032 + }, + { + "epoch": 0.617089087873729, + "grad_norm": 0.40976065397262573, + "learning_rate": 8.975903614457832e-05, + "loss": 1.6395, + "step": 2033 + }, + { + "epoch": 0.6173926240704204, + "grad_norm": 0.5633681416511536, + "learning_rate": 8.975397387870811e-05, + "loss": 2.117, + "step": 2034 + }, + { + "epoch": 0.6176961602671118, + "grad_norm": 0.4393365681171417, + "learning_rate": 8.974891161283792e-05, + "loss": 2.0793, + "step": 2035 + }, + { + "epoch": 0.6179996964638033, + "grad_norm": 0.3982914388179779, + "learning_rate": 8.974384934696771e-05, + "loss": 1.9608, + "step": 2036 + }, + { + "epoch": 0.6183032326604948, + "grad_norm": 0.41689884662628174, + "learning_rate": 8.973878708109751e-05, + "loss": 2.0265, + "step": 2037 + }, + { + "epoch": 0.6186067688571862, + "grad_norm": 0.46085304021835327, + "learning_rate": 8.97337248152273e-05, + "loss": 1.8147, + "step": 2038 + }, + { + "epoch": 0.6189103050538777, + "grad_norm": 0.4536703824996948, + "learning_rate": 8.97286625493571e-05, + "loss": 1.861, + "step": 2039 + }, + { + "epoch": 0.6192138412505691, + "grad_norm": 0.4332161843776703, + "learning_rate": 8.97236002834869e-05, + "loss": 1.5579, + "step": 2040 + }, + { + "epoch": 0.6195173774472605, + "grad_norm": 0.3992736041545868, + "learning_rate": 8.971853801761669e-05, + "loss": 1.9451, + "step": 2041 + }, + { + "epoch": 0.6198209136439521, + "grad_norm": 0.39501848816871643, + "learning_rate": 8.971347575174648e-05, + "loss": 1.9228, + "step": 2042 + }, + { + "epoch": 0.6201244498406435, + "grad_norm": 0.44429096579551697, + "learning_rate": 8.970841348587629e-05, + "loss": 1.6277, + "step": 2043 + }, + { + "epoch": 0.620427986037335, + "grad_norm": 0.5381520390510559, + "learning_rate": 8.970335122000609e-05, + "loss": 1.4842, + "step": 2044 + }, + { + "epoch": 0.6207315222340264, + "grad_norm": 0.3807857036590576, + "learning_rate": 8.969828895413588e-05, + "loss": 1.9271, + "step": 2045 + }, + { + "epoch": 0.6210350584307178, + "grad_norm": 0.4522213041782379, + "learning_rate": 8.969322668826568e-05, + "loss": 1.747, + "step": 2046 + }, + { + "epoch": 0.6213385946274094, + "grad_norm": 0.37111926078796387, + "learning_rate": 8.968816442239547e-05, + "loss": 1.9039, + "step": 2047 + }, + { + "epoch": 0.6216421308241008, + "grad_norm": 0.7616074681282043, + "learning_rate": 8.968310215652527e-05, + "loss": 2.0966, + "step": 2048 + }, + { + "epoch": 0.6219456670207922, + "grad_norm": 0.42669475078582764, + "learning_rate": 8.967803989065506e-05, + "loss": 2.0545, + "step": 2049 + }, + { + "epoch": 0.6222492032174837, + "grad_norm": 0.3741990625858307, + "learning_rate": 8.967297762478486e-05, + "loss": 1.9964, + "step": 2050 + }, + { + "epoch": 0.6225527394141751, + "grad_norm": 0.38759157061576843, + "learning_rate": 8.966791535891465e-05, + "loss": 1.8108, + "step": 2051 + }, + { + "epoch": 0.6228562756108666, + "grad_norm": 0.424344003200531, + "learning_rate": 8.966285309304446e-05, + "loss": 1.8411, + "step": 2052 + }, + { + "epoch": 0.6231598118075581, + "grad_norm": 0.3969878852367401, + "learning_rate": 8.965779082717425e-05, + "loss": 2.1543, + "step": 2053 + }, + { + "epoch": 0.6234633480042495, + "grad_norm": 0.4188143312931061, + "learning_rate": 8.965272856130405e-05, + "loss": 1.8696, + "step": 2054 + }, + { + "epoch": 0.623766884200941, + "grad_norm": 0.42061781883239746, + "learning_rate": 8.964766629543384e-05, + "loss": 2.0641, + "step": 2055 + }, + { + "epoch": 0.6240704203976324, + "grad_norm": 0.3898957371711731, + "learning_rate": 8.964260402956364e-05, + "loss": 1.9002, + "step": 2056 + }, + { + "epoch": 0.6243739565943238, + "grad_norm": 0.4503360688686371, + "learning_rate": 8.963754176369343e-05, + "loss": 1.8302, + "step": 2057 + }, + { + "epoch": 0.6246774927910154, + "grad_norm": 0.44356441497802734, + "learning_rate": 8.963247949782323e-05, + "loss": 1.589, + "step": 2058 + }, + { + "epoch": 0.6249810289877068, + "grad_norm": 0.3989812433719635, + "learning_rate": 8.962741723195302e-05, + "loss": 2.1592, + "step": 2059 + }, + { + "epoch": 0.6252845651843982, + "grad_norm": 0.3959946632385254, + "learning_rate": 8.962235496608282e-05, + "loss": 2.0769, + "step": 2060 + }, + { + "epoch": 0.6255881013810897, + "grad_norm": 0.37260061502456665, + "learning_rate": 8.961729270021262e-05, + "loss": 1.2118, + "step": 2061 + }, + { + "epoch": 0.6258916375777811, + "grad_norm": 0.5840566754341125, + "learning_rate": 8.961223043434242e-05, + "loss": 2.1389, + "step": 2062 + }, + { + "epoch": 0.6261951737744726, + "grad_norm": 0.44715970754623413, + "learning_rate": 8.960716816847221e-05, + "loss": 1.6985, + "step": 2063 + }, + { + "epoch": 0.6264987099711641, + "grad_norm": 0.40047672390937805, + "learning_rate": 8.960210590260201e-05, + "loss": 1.9951, + "step": 2064 + }, + { + "epoch": 0.6268022461678555, + "grad_norm": 0.4090017080307007, + "learning_rate": 8.95970436367318e-05, + "loss": 1.8258, + "step": 2065 + }, + { + "epoch": 0.627105782364547, + "grad_norm": 0.39617207646369934, + "learning_rate": 8.95919813708616e-05, + "loss": 1.6203, + "step": 2066 + }, + { + "epoch": 0.6274093185612384, + "grad_norm": 0.4236812889575958, + "learning_rate": 8.95869191049914e-05, + "loss": 1.9766, + "step": 2067 + }, + { + "epoch": 0.6277128547579299, + "grad_norm": 0.560946524143219, + "learning_rate": 8.958185683912119e-05, + "loss": 2.3476, + "step": 2068 + }, + { + "epoch": 0.6280163909546214, + "grad_norm": 0.4474948048591614, + "learning_rate": 8.957679457325098e-05, + "loss": 1.9045, + "step": 2069 + }, + { + "epoch": 0.6283199271513128, + "grad_norm": 0.47307664155960083, + "learning_rate": 8.957173230738078e-05, + "loss": 1.9436, + "step": 2070 + }, + { + "epoch": 0.6286234633480042, + "grad_norm": 0.4518156945705414, + "learning_rate": 8.956667004151059e-05, + "loss": 1.6307, + "step": 2071 + }, + { + "epoch": 0.6289269995446957, + "grad_norm": 0.3907441794872284, + "learning_rate": 8.956160777564038e-05, + "loss": 1.9797, + "step": 2072 + }, + { + "epoch": 0.6292305357413872, + "grad_norm": 0.7602722644805908, + "learning_rate": 8.955654550977018e-05, + "loss": 1.467, + "step": 2073 + }, + { + "epoch": 0.6295340719380786, + "grad_norm": 0.4778296947479248, + "learning_rate": 8.955148324389997e-05, + "loss": 1.592, + "step": 2074 + }, + { + "epoch": 0.6298376081347701, + "grad_norm": 0.5303634405136108, + "learning_rate": 8.954642097802977e-05, + "loss": 1.9301, + "step": 2075 + }, + { + "epoch": 0.6301411443314615, + "grad_norm": 0.37609922885894775, + "learning_rate": 8.954135871215956e-05, + "loss": 2.0062, + "step": 2076 + }, + { + "epoch": 0.630444680528153, + "grad_norm": 0.3961854875087738, + "learning_rate": 8.953629644628936e-05, + "loss": 2.0677, + "step": 2077 + }, + { + "epoch": 0.6307482167248445, + "grad_norm": 0.43167874217033386, + "learning_rate": 8.953123418041915e-05, + "loss": 1.7997, + "step": 2078 + }, + { + "epoch": 0.6310517529215359, + "grad_norm": 0.4458840489387512, + "learning_rate": 8.952617191454896e-05, + "loss": 1.6458, + "step": 2079 + }, + { + "epoch": 0.6313552891182274, + "grad_norm": 0.8174628615379333, + "learning_rate": 8.952110964867875e-05, + "loss": 1.3436, + "step": 2080 + }, + { + "epoch": 0.6316588253149188, + "grad_norm": 0.40314528346061707, + "learning_rate": 8.951604738280855e-05, + "loss": 1.648, + "step": 2081 + }, + { + "epoch": 0.6319623615116102, + "grad_norm": 2.845505952835083, + "learning_rate": 8.951098511693836e-05, + "loss": 2.0645, + "step": 2082 + }, + { + "epoch": 0.6322658977083017, + "grad_norm": 0.41686686873435974, + "learning_rate": 8.950592285106815e-05, + "loss": 1.9434, + "step": 2083 + }, + { + "epoch": 0.6325694339049932, + "grad_norm": 5.579742431640625, + "learning_rate": 8.950086058519795e-05, + "loss": 2.1942, + "step": 2084 + }, + { + "epoch": 0.6328729701016846, + "grad_norm": 0.40614521503448486, + "learning_rate": 8.949579831932774e-05, + "loss": 1.7526, + "step": 2085 + }, + { + "epoch": 0.6331765062983761, + "grad_norm": 0.8609543442726135, + "learning_rate": 8.949073605345754e-05, + "loss": 1.4042, + "step": 2086 + }, + { + "epoch": 0.6334800424950675, + "grad_norm": 0.451594740152359, + "learning_rate": 8.948567378758733e-05, + "loss": 2.085, + "step": 2087 + }, + { + "epoch": 0.6337835786917589, + "grad_norm": 0.48546943068504333, + "learning_rate": 8.948061152171713e-05, + "loss": 1.8299, + "step": 2088 + }, + { + "epoch": 0.6340871148884505, + "grad_norm": 0.4435253143310547, + "learning_rate": 8.947554925584692e-05, + "loss": 1.9175, + "step": 2089 + }, + { + "epoch": 0.6343906510851419, + "grad_norm": 0.4109974801540375, + "learning_rate": 8.947048698997671e-05, + "loss": 1.6901, + "step": 2090 + }, + { + "epoch": 0.6346941872818334, + "grad_norm": 0.4205876290798187, + "learning_rate": 8.946542472410652e-05, + "loss": 1.6726, + "step": 2091 + }, + { + "epoch": 0.6349977234785248, + "grad_norm": 0.4449016749858856, + "learning_rate": 8.946036245823632e-05, + "loss": 2.2222, + "step": 2092 + }, + { + "epoch": 0.6353012596752162, + "grad_norm": 0.44236990809440613, + "learning_rate": 8.945530019236611e-05, + "loss": 2.2247, + "step": 2093 + }, + { + "epoch": 0.6356047958719078, + "grad_norm": 0.4700889587402344, + "learning_rate": 8.945023792649591e-05, + "loss": 1.6692, + "step": 2094 + }, + { + "epoch": 0.6359083320685992, + "grad_norm": 0.42525413632392883, + "learning_rate": 8.94451756606257e-05, + "loss": 1.5726, + "step": 2095 + }, + { + "epoch": 0.6362118682652906, + "grad_norm": 0.3753025233745575, + "learning_rate": 8.94401133947555e-05, + "loss": 1.8565, + "step": 2096 + }, + { + "epoch": 0.6365154044619821, + "grad_norm": 0.3908928632736206, + "learning_rate": 8.943505112888529e-05, + "loss": 1.9776, + "step": 2097 + }, + { + "epoch": 0.6368189406586735, + "grad_norm": 0.4409022927284241, + "learning_rate": 8.942998886301509e-05, + "loss": 1.8076, + "step": 2098 + }, + { + "epoch": 0.637122476855365, + "grad_norm": 0.4187740981578827, + "learning_rate": 8.942492659714488e-05, + "loss": 2.0177, + "step": 2099 + }, + { + "epoch": 0.6374260130520565, + "grad_norm": 0.4491542875766754, + "learning_rate": 8.941986433127469e-05, + "loss": 1.817, + "step": 2100 + }, + { + "epoch": 0.6377295492487479, + "grad_norm": 0.4964027404785156, + "learning_rate": 8.941480206540448e-05, + "loss": 1.618, + "step": 2101 + }, + { + "epoch": 0.6380330854454394, + "grad_norm": 0.4044201672077179, + "learning_rate": 8.940973979953428e-05, + "loss": 1.4983, + "step": 2102 + }, + { + "epoch": 0.6383366216421308, + "grad_norm": 0.4115463197231293, + "learning_rate": 8.940467753366407e-05, + "loss": 1.9043, + "step": 2103 + }, + { + "epoch": 0.6386401578388223, + "grad_norm": 0.39303481578826904, + "learning_rate": 8.939961526779387e-05, + "loss": 1.434, + "step": 2104 + }, + { + "epoch": 0.6389436940355138, + "grad_norm": 0.3657127618789673, + "learning_rate": 8.939455300192366e-05, + "loss": 1.9805, + "step": 2105 + }, + { + "epoch": 0.6392472302322052, + "grad_norm": 0.414969265460968, + "learning_rate": 8.938949073605346e-05, + "loss": 1.745, + "step": 2106 + }, + { + "epoch": 0.6395507664288966, + "grad_norm": 0.4560664892196655, + "learning_rate": 8.938442847018325e-05, + "loss": 1.6991, + "step": 2107 + }, + { + "epoch": 0.6398543026255881, + "grad_norm": 0.4387153387069702, + "learning_rate": 8.937936620431305e-05, + "loss": 1.7517, + "step": 2108 + }, + { + "epoch": 0.6401578388222796, + "grad_norm": 0.39767786860466003, + "learning_rate": 8.937430393844284e-05, + "loss": 1.4668, + "step": 2109 + }, + { + "epoch": 0.640461375018971, + "grad_norm": 0.4568266272544861, + "learning_rate": 8.936924167257265e-05, + "loss": 1.7829, + "step": 2110 + }, + { + "epoch": 0.6407649112156625, + "grad_norm": 0.3790264427661896, + "learning_rate": 8.936417940670245e-05, + "loss": 1.8335, + "step": 2111 + }, + { + "epoch": 0.6410684474123539, + "grad_norm": 0.39457952976226807, + "learning_rate": 8.935911714083224e-05, + "loss": 1.687, + "step": 2112 + }, + { + "epoch": 0.6413719836090453, + "grad_norm": 0.32461151480674744, + "learning_rate": 8.935405487496204e-05, + "loss": 1.4628, + "step": 2113 + }, + { + "epoch": 0.6416755198057368, + "grad_norm": 0.36477747559547424, + "learning_rate": 8.934899260909183e-05, + "loss": 1.9856, + "step": 2114 + }, + { + "epoch": 0.6419790560024283, + "grad_norm": 0.4230240285396576, + "learning_rate": 8.934393034322163e-05, + "loss": 2.037, + "step": 2115 + }, + { + "epoch": 0.6422825921991198, + "grad_norm": 0.3885568380355835, + "learning_rate": 8.933886807735142e-05, + "loss": 1.7229, + "step": 2116 + }, + { + "epoch": 0.6425861283958112, + "grad_norm": 0.46484097838401794, + "learning_rate": 8.933380581148121e-05, + "loss": 1.9656, + "step": 2117 + }, + { + "epoch": 0.6428896645925026, + "grad_norm": 0.3922126591205597, + "learning_rate": 8.932874354561101e-05, + "loss": 1.652, + "step": 2118 + }, + { + "epoch": 0.6431932007891941, + "grad_norm": 0.4676629602909088, + "learning_rate": 8.932368127974082e-05, + "loss": 2.1305, + "step": 2119 + }, + { + "epoch": 0.6434967369858856, + "grad_norm": 0.3731312155723572, + "learning_rate": 8.931861901387061e-05, + "loss": 2.0093, + "step": 2120 + }, + { + "epoch": 0.643800273182577, + "grad_norm": 0.44040486216545105, + "learning_rate": 8.931355674800041e-05, + "loss": 1.9446, + "step": 2121 + }, + { + "epoch": 0.6441038093792685, + "grad_norm": 0.3713996112346649, + "learning_rate": 8.93084944821302e-05, + "loss": 2.1773, + "step": 2122 + }, + { + "epoch": 0.6444073455759599, + "grad_norm": 0.3798523247241974, + "learning_rate": 8.930343221626e-05, + "loss": 1.7056, + "step": 2123 + }, + { + "epoch": 0.6447108817726513, + "grad_norm": 0.4175238013267517, + "learning_rate": 8.92983699503898e-05, + "loss": 1.966, + "step": 2124 + }, + { + "epoch": 0.6450144179693429, + "grad_norm": 0.40957748889923096, + "learning_rate": 8.92933076845196e-05, + "loss": 1.6715, + "step": 2125 + }, + { + "epoch": 0.6453179541660343, + "grad_norm": 0.46979820728302, + "learning_rate": 8.92882454186494e-05, + "loss": 1.8604, + "step": 2126 + }, + { + "epoch": 0.6456214903627258, + "grad_norm": 0.3671952188014984, + "learning_rate": 8.928318315277919e-05, + "loss": 1.1596, + "step": 2127 + }, + { + "epoch": 0.6459250265594172, + "grad_norm": 0.363288551568985, + "learning_rate": 8.927812088690898e-05, + "loss": 2.14, + "step": 2128 + }, + { + "epoch": 0.6462285627561086, + "grad_norm": 0.3632570505142212, + "learning_rate": 8.927305862103878e-05, + "loss": 2.0746, + "step": 2129 + }, + { + "epoch": 0.6465320989528002, + "grad_norm": 0.5912741422653198, + "learning_rate": 8.926799635516859e-05, + "loss": 2.1828, + "step": 2130 + }, + { + "epoch": 0.6468356351494916, + "grad_norm": 0.3740077018737793, + "learning_rate": 8.926293408929838e-05, + "loss": 1.9439, + "step": 2131 + }, + { + "epoch": 0.647139171346183, + "grad_norm": 0.5042386651039124, + "learning_rate": 8.925787182342818e-05, + "loss": 1.1905, + "step": 2132 + }, + { + "epoch": 0.6474427075428745, + "grad_norm": 0.39761942625045776, + "learning_rate": 8.925280955755797e-05, + "loss": 1.3763, + "step": 2133 + }, + { + "epoch": 0.6477462437395659, + "grad_norm": 0.6671484112739563, + "learning_rate": 8.924774729168777e-05, + "loss": 2.2412, + "step": 2134 + }, + { + "epoch": 0.6480497799362575, + "grad_norm": 0.40470197796821594, + "learning_rate": 8.924268502581756e-05, + "loss": 2.0007, + "step": 2135 + }, + { + "epoch": 0.6483533161329489, + "grad_norm": 1.5381660461425781, + "learning_rate": 8.923762275994736e-05, + "loss": 2.101, + "step": 2136 + }, + { + "epoch": 0.6486568523296403, + "grad_norm": 0.39186039566993713, + "learning_rate": 8.923256049407715e-05, + "loss": 1.8134, + "step": 2137 + }, + { + "epoch": 0.6489603885263318, + "grad_norm": 0.351701021194458, + "learning_rate": 8.922749822820695e-05, + "loss": 1.6034, + "step": 2138 + }, + { + "epoch": 0.6492639247230232, + "grad_norm": 1.6814361810684204, + "learning_rate": 8.922243596233675e-05, + "loss": 1.6059, + "step": 2139 + }, + { + "epoch": 0.6495674609197146, + "grad_norm": 0.4578597843647003, + "learning_rate": 8.921737369646655e-05, + "loss": 1.5098, + "step": 2140 + }, + { + "epoch": 0.6498709971164062, + "grad_norm": 0.44496893882751465, + "learning_rate": 8.921231143059634e-05, + "loss": 2.0494, + "step": 2141 + }, + { + "epoch": 0.6501745333130976, + "grad_norm": 0.3577191233634949, + "learning_rate": 8.920724916472614e-05, + "loss": 1.8182, + "step": 2142 + }, + { + "epoch": 0.650478069509789, + "grad_norm": 0.42032182216644287, + "learning_rate": 8.920218689885593e-05, + "loss": 2.0543, + "step": 2143 + }, + { + "epoch": 0.6507816057064805, + "grad_norm": 0.3442663550376892, + "learning_rate": 8.919712463298573e-05, + "loss": 1.699, + "step": 2144 + }, + { + "epoch": 0.6510851419031719, + "grad_norm": 0.479557067155838, + "learning_rate": 8.919206236711552e-05, + "loss": 2.3661, + "step": 2145 + }, + { + "epoch": 0.6513886780998634, + "grad_norm": 0.4386119246482849, + "learning_rate": 8.918700010124532e-05, + "loss": 1.9253, + "step": 2146 + }, + { + "epoch": 0.6516922142965549, + "grad_norm": 0.38390249013900757, + "learning_rate": 8.918193783537511e-05, + "loss": 1.7774, + "step": 2147 + }, + { + "epoch": 0.6519957504932463, + "grad_norm": 0.3760508596897125, + "learning_rate": 8.917687556950491e-05, + "loss": 2.0236, + "step": 2148 + }, + { + "epoch": 0.6522992866899378, + "grad_norm": 0.41757336258888245, + "learning_rate": 8.917181330363472e-05, + "loss": 1.8861, + "step": 2149 + }, + { + "epoch": 0.6526028228866292, + "grad_norm": 0.40840038657188416, + "learning_rate": 8.916675103776451e-05, + "loss": 1.405, + "step": 2150 + }, + { + "epoch": 0.6529063590833207, + "grad_norm": 0.3661898672580719, + "learning_rate": 8.91616887718943e-05, + "loss": 1.6111, + "step": 2151 + }, + { + "epoch": 0.6532098952800122, + "grad_norm": 0.42466968297958374, + "learning_rate": 8.91566265060241e-05, + "loss": 1.6713, + "step": 2152 + }, + { + "epoch": 0.6535134314767036, + "grad_norm": 0.5033214092254639, + "learning_rate": 8.91515642401539e-05, + "loss": 2.0999, + "step": 2153 + }, + { + "epoch": 0.653816967673395, + "grad_norm": 0.3836124837398529, + "learning_rate": 8.914650197428369e-05, + "loss": 2.2854, + "step": 2154 + }, + { + "epoch": 0.6541205038700865, + "grad_norm": 0.42189982533454895, + "learning_rate": 8.914143970841348e-05, + "loss": 2.154, + "step": 2155 + }, + { + "epoch": 0.654424040066778, + "grad_norm": 0.3981611430644989, + "learning_rate": 8.913637744254328e-05, + "loss": 1.6845, + "step": 2156 + }, + { + "epoch": 0.6547275762634694, + "grad_norm": 0.4584210515022278, + "learning_rate": 8.913131517667307e-05, + "loss": 1.8831, + "step": 2157 + }, + { + "epoch": 0.6550311124601609, + "grad_norm": 0.42922207713127136, + "learning_rate": 8.912625291080288e-05, + "loss": 1.8187, + "step": 2158 + }, + { + "epoch": 0.6553346486568523, + "grad_norm": 0.4891490638256073, + "learning_rate": 8.912119064493268e-05, + "loss": 2.0393, + "step": 2159 + }, + { + "epoch": 0.6556381848535437, + "grad_norm": 0.44946572184562683, + "learning_rate": 8.911612837906247e-05, + "loss": 2.0362, + "step": 2160 + }, + { + "epoch": 0.6559417210502353, + "grad_norm": 0.5170040726661682, + "learning_rate": 8.911106611319227e-05, + "loss": 1.0148, + "step": 2161 + }, + { + "epoch": 0.6562452572469267, + "grad_norm": 0.45176056027412415, + "learning_rate": 8.910600384732206e-05, + "loss": 1.9087, + "step": 2162 + }, + { + "epoch": 0.6565487934436182, + "grad_norm": 0.3974052965641022, + "learning_rate": 8.910094158145186e-05, + "loss": 1.7759, + "step": 2163 + }, + { + "epoch": 0.6568523296403096, + "grad_norm": 0.4142087399959564, + "learning_rate": 8.909587931558165e-05, + "loss": 1.8639, + "step": 2164 + }, + { + "epoch": 0.657155865837001, + "grad_norm": 0.4220983386039734, + "learning_rate": 8.909081704971145e-05, + "loss": 1.4122, + "step": 2165 + }, + { + "epoch": 0.6574594020336925, + "grad_norm": 0.37949880957603455, + "learning_rate": 8.908575478384124e-05, + "loss": 1.9989, + "step": 2166 + }, + { + "epoch": 0.657762938230384, + "grad_norm": 0.35547998547554016, + "learning_rate": 8.908069251797105e-05, + "loss": 1.9514, + "step": 2167 + }, + { + "epoch": 0.6580664744270754, + "grad_norm": 0.4009557366371155, + "learning_rate": 8.907563025210084e-05, + "loss": 1.7043, + "step": 2168 + }, + { + "epoch": 0.6583700106237669, + "grad_norm": 0.38969942927360535, + "learning_rate": 8.907056798623065e-05, + "loss": 1.8512, + "step": 2169 + }, + { + "epoch": 0.6586735468204583, + "grad_norm": 0.4015234708786011, + "learning_rate": 8.906550572036045e-05, + "loss": 1.9016, + "step": 2170 + }, + { + "epoch": 0.6589770830171497, + "grad_norm": 0.45555707812309265, + "learning_rate": 8.906044345449024e-05, + "loss": 2.1088, + "step": 2171 + }, + { + "epoch": 0.6592806192138413, + "grad_norm": 0.3557066321372986, + "learning_rate": 8.905538118862004e-05, + "loss": 1.6273, + "step": 2172 + }, + { + "epoch": 0.6595841554105327, + "grad_norm": 0.44995880126953125, + "learning_rate": 8.905031892274983e-05, + "loss": 1.7946, + "step": 2173 + }, + { + "epoch": 0.6598876916072242, + "grad_norm": 0.40973517298698425, + "learning_rate": 8.904525665687963e-05, + "loss": 1.7571, + "step": 2174 + }, + { + "epoch": 0.6601912278039156, + "grad_norm": 0.3300071656703949, + "learning_rate": 8.904019439100942e-05, + "loss": 1.2977, + "step": 2175 + }, + { + "epoch": 0.660494764000607, + "grad_norm": 0.4011610746383667, + "learning_rate": 8.903513212513922e-05, + "loss": 2.0934, + "step": 2176 + }, + { + "epoch": 0.6607983001972986, + "grad_norm": 0.35637664794921875, + "learning_rate": 8.903006985926901e-05, + "loss": 1.9632, + "step": 2177 + }, + { + "epoch": 0.66110183639399, + "grad_norm": 0.45524492859840393, + "learning_rate": 8.902500759339882e-05, + "loss": 1.8951, + "step": 2178 + }, + { + "epoch": 0.6614053725906814, + "grad_norm": 0.45453348755836487, + "learning_rate": 8.901994532752861e-05, + "loss": 1.84, + "step": 2179 + }, + { + "epoch": 0.6617089087873729, + "grad_norm": 0.4106372892856598, + "learning_rate": 8.901488306165841e-05, + "loss": 2.171, + "step": 2180 + }, + { + "epoch": 0.6620124449840643, + "grad_norm": 0.6188797950744629, + "learning_rate": 8.90098207957882e-05, + "loss": 1.3866, + "step": 2181 + }, + { + "epoch": 0.6623159811807559, + "grad_norm": 0.3466598093509674, + "learning_rate": 8.9004758529918e-05, + "loss": 1.7782, + "step": 2182 + }, + { + "epoch": 0.6626195173774473, + "grad_norm": 0.4912582337856293, + "learning_rate": 8.899969626404779e-05, + "loss": 1.2761, + "step": 2183 + }, + { + "epoch": 0.6629230535741387, + "grad_norm": 0.46108344197273254, + "learning_rate": 8.899463399817759e-05, + "loss": 1.4373, + "step": 2184 + }, + { + "epoch": 0.6632265897708302, + "grad_norm": 0.5269731879234314, + "learning_rate": 8.898957173230738e-05, + "loss": 1.4146, + "step": 2185 + }, + { + "epoch": 0.6635301259675216, + "grad_norm": 0.4078417122364044, + "learning_rate": 8.898450946643718e-05, + "loss": 2.2392, + "step": 2186 + }, + { + "epoch": 0.6638336621642131, + "grad_norm": 0.36829376220703125, + "learning_rate": 8.897944720056697e-05, + "loss": 2.1447, + "step": 2187 + }, + { + "epoch": 0.6641371983609046, + "grad_norm": 0.38769134879112244, + "learning_rate": 8.897438493469678e-05, + "loss": 1.5877, + "step": 2188 + }, + { + "epoch": 0.664440734557596, + "grad_norm": 0.3847033381462097, + "learning_rate": 8.896932266882658e-05, + "loss": 1.9341, + "step": 2189 + }, + { + "epoch": 0.6647442707542874, + "grad_norm": 0.47725987434387207, + "learning_rate": 8.896426040295637e-05, + "loss": 2.122, + "step": 2190 + }, + { + "epoch": 0.6650478069509789, + "grad_norm": 0.4192405045032501, + "learning_rate": 8.895919813708617e-05, + "loss": 2.1324, + "step": 2191 + }, + { + "epoch": 0.6653513431476703, + "grad_norm": 0.5160967707633972, + "learning_rate": 8.895413587121596e-05, + "loss": 1.4909, + "step": 2192 + }, + { + "epoch": 0.6656548793443618, + "grad_norm": 0.437773734331131, + "learning_rate": 8.894907360534575e-05, + "loss": 1.7785, + "step": 2193 + }, + { + "epoch": 0.6659584155410533, + "grad_norm": 0.36092495918273926, + "learning_rate": 8.894401133947555e-05, + "loss": 1.9376, + "step": 2194 + }, + { + "epoch": 0.6662619517377447, + "grad_norm": 0.4263538122177124, + "learning_rate": 8.893894907360534e-05, + "loss": 2.1226, + "step": 2195 + }, + { + "epoch": 0.6665654879344362, + "grad_norm": 0.41431042551994324, + "learning_rate": 8.893388680773514e-05, + "loss": 2.1941, + "step": 2196 + }, + { + "epoch": 0.6668690241311276, + "grad_norm": 0.4508149027824402, + "learning_rate": 8.892882454186495e-05, + "loss": 1.8846, + "step": 2197 + }, + { + "epoch": 0.6671725603278191, + "grad_norm": 0.3481595516204834, + "learning_rate": 8.892376227599474e-05, + "loss": 1.9913, + "step": 2198 + }, + { + "epoch": 0.6674760965245106, + "grad_norm": 0.420114129781723, + "learning_rate": 8.891870001012454e-05, + "loss": 1.5997, + "step": 2199 + }, + { + "epoch": 0.667779632721202, + "grad_norm": 0.44123902916908264, + "learning_rate": 8.891363774425433e-05, + "loss": 2.112, + "step": 2200 + }, + { + "epoch": 0.6680831689178934, + "grad_norm": 0.4254309833049774, + "learning_rate": 8.890857547838413e-05, + "loss": 1.8562, + "step": 2201 + }, + { + "epoch": 0.6683867051145849, + "grad_norm": 0.3660505414009094, + "learning_rate": 8.890351321251392e-05, + "loss": 1.8612, + "step": 2202 + }, + { + "epoch": 0.6686902413112764, + "grad_norm": 0.3969692587852478, + "learning_rate": 8.889845094664372e-05, + "loss": 2.2214, + "step": 2203 + }, + { + "epoch": 0.6689937775079678, + "grad_norm": 0.36976855993270874, + "learning_rate": 8.889338868077351e-05, + "loss": 1.8667, + "step": 2204 + }, + { + "epoch": 0.6692973137046593, + "grad_norm": 0.44680026173591614, + "learning_rate": 8.88883264149033e-05, + "loss": 2.0648, + "step": 2205 + }, + { + "epoch": 0.6696008499013507, + "grad_norm": 0.40876418352127075, + "learning_rate": 8.888326414903311e-05, + "loss": 2.027, + "step": 2206 + }, + { + "epoch": 0.6699043860980421, + "grad_norm": 0.35822147130966187, + "learning_rate": 8.887820188316291e-05, + "loss": 1.8987, + "step": 2207 + }, + { + "epoch": 0.6702079222947337, + "grad_norm": 0.41419175267219543, + "learning_rate": 8.88731396172927e-05, + "loss": 2.114, + "step": 2208 + }, + { + "epoch": 0.6705114584914251, + "grad_norm": 0.3790142834186554, + "learning_rate": 8.88680773514225e-05, + "loss": 1.8878, + "step": 2209 + }, + { + "epoch": 0.6708149946881166, + "grad_norm": 0.42493680119514465, + "learning_rate": 8.88630150855523e-05, + "loss": 1.8914, + "step": 2210 + }, + { + "epoch": 0.671118530884808, + "grad_norm": 0.34427767992019653, + "learning_rate": 8.885795281968209e-05, + "loss": 2.1252, + "step": 2211 + }, + { + "epoch": 0.6714220670814994, + "grad_norm": 0.43361228704452515, + "learning_rate": 8.885289055381188e-05, + "loss": 1.677, + "step": 2212 + }, + { + "epoch": 0.671725603278191, + "grad_norm": 0.3793098032474518, + "learning_rate": 8.884782828794169e-05, + "loss": 1.9914, + "step": 2213 + }, + { + "epoch": 0.6720291394748824, + "grad_norm": 0.4355357587337494, + "learning_rate": 8.884276602207149e-05, + "loss": 2.0324, + "step": 2214 + }, + { + "epoch": 0.6723326756715738, + "grad_norm": 0.41514819860458374, + "learning_rate": 8.883770375620128e-05, + "loss": 1.6874, + "step": 2215 + }, + { + "epoch": 0.6726362118682653, + "grad_norm": 0.4044744074344635, + "learning_rate": 8.883264149033108e-05, + "loss": 1.8879, + "step": 2216 + }, + { + "epoch": 0.6729397480649567, + "grad_norm": 0.38892972469329834, + "learning_rate": 8.882757922446088e-05, + "loss": 1.8009, + "step": 2217 + }, + { + "epoch": 0.6732432842616483, + "grad_norm": 0.41450080275535583, + "learning_rate": 8.882251695859068e-05, + "loss": 1.7411, + "step": 2218 + }, + { + "epoch": 0.6735468204583397, + "grad_norm": 0.41548603773117065, + "learning_rate": 8.881745469272047e-05, + "loss": 2.158, + "step": 2219 + }, + { + "epoch": 0.6738503566550311, + "grad_norm": 0.4178054928779602, + "learning_rate": 8.881239242685027e-05, + "loss": 1.7454, + "step": 2220 + }, + { + "epoch": 0.6741538928517226, + "grad_norm": 1.661353588104248, + "learning_rate": 8.880733016098006e-05, + "loss": 1.6626, + "step": 2221 + }, + { + "epoch": 0.674457429048414, + "grad_norm": 0.40055370330810547, + "learning_rate": 8.880226789510986e-05, + "loss": 1.9827, + "step": 2222 + }, + { + "epoch": 0.6747609652451054, + "grad_norm": 0.41323450207710266, + "learning_rate": 8.879720562923965e-05, + "loss": 1.6335, + "step": 2223 + }, + { + "epoch": 0.675064501441797, + "grad_norm": 0.4238756597042084, + "learning_rate": 8.879214336336945e-05, + "loss": 1.7076, + "step": 2224 + }, + { + "epoch": 0.6753680376384884, + "grad_norm": 0.39978405833244324, + "learning_rate": 8.878708109749924e-05, + "loss": 1.6837, + "step": 2225 + }, + { + "epoch": 0.6756715738351798, + "grad_norm": 0.4585546851158142, + "learning_rate": 8.878201883162904e-05, + "loss": 2.0821, + "step": 2226 + }, + { + "epoch": 0.6759751100318713, + "grad_norm": 0.40500447154045105, + "learning_rate": 8.877695656575885e-05, + "loss": 1.7139, + "step": 2227 + }, + { + "epoch": 0.6762786462285627, + "grad_norm": 0.4829038381576538, + "learning_rate": 8.877189429988864e-05, + "loss": 1.9029, + "step": 2228 + }, + { + "epoch": 0.6765821824252543, + "grad_norm": 0.4088328182697296, + "learning_rate": 8.876683203401844e-05, + "loss": 2.0233, + "step": 2229 + }, + { + "epoch": 0.6768857186219457, + "grad_norm": 0.4438897967338562, + "learning_rate": 8.876176976814823e-05, + "loss": 1.824, + "step": 2230 + }, + { + "epoch": 0.6771892548186371, + "grad_norm": 0.4573661684989929, + "learning_rate": 8.875670750227802e-05, + "loss": 2.0605, + "step": 2231 + }, + { + "epoch": 0.6774927910153286, + "grad_norm": 0.5133582949638367, + "learning_rate": 8.875164523640782e-05, + "loss": 1.7161, + "step": 2232 + }, + { + "epoch": 0.67779632721202, + "grad_norm": 0.3775865137577057, + "learning_rate": 8.874658297053761e-05, + "loss": 1.513, + "step": 2233 + }, + { + "epoch": 0.6780998634087115, + "grad_norm": 0.4122471809387207, + "learning_rate": 8.874152070466741e-05, + "loss": 1.8155, + "step": 2234 + }, + { + "epoch": 0.678403399605403, + "grad_norm": 0.6338900327682495, + "learning_rate": 8.87364584387972e-05, + "loss": 1.5857, + "step": 2235 + }, + { + "epoch": 0.6787069358020944, + "grad_norm": 0.41020557284355164, + "learning_rate": 8.873139617292701e-05, + "loss": 1.7888, + "step": 2236 + }, + { + "epoch": 0.6790104719987858, + "grad_norm": 0.3700268268585205, + "learning_rate": 8.872633390705681e-05, + "loss": 1.5622, + "step": 2237 + }, + { + "epoch": 0.6793140081954773, + "grad_norm": 0.4358409345149994, + "learning_rate": 8.87212716411866e-05, + "loss": 2.0885, + "step": 2238 + }, + { + "epoch": 0.6796175443921688, + "grad_norm": 0.4212052822113037, + "learning_rate": 8.87162093753164e-05, + "loss": 2.0268, + "step": 2239 + }, + { + "epoch": 0.6799210805888602, + "grad_norm": 0.7132793068885803, + "learning_rate": 8.871114710944619e-05, + "loss": 2.0234, + "step": 2240 + }, + { + "epoch": 0.6802246167855517, + "grad_norm": 0.38493213057518005, + "learning_rate": 8.870608484357599e-05, + "loss": 1.9204, + "step": 2241 + }, + { + "epoch": 0.6805281529822431, + "grad_norm": 0.3852492570877075, + "learning_rate": 8.870102257770578e-05, + "loss": 1.8373, + "step": 2242 + }, + { + "epoch": 0.6808316891789346, + "grad_norm": 0.5180450081825256, + "learning_rate": 8.869596031183558e-05, + "loss": 1.3947, + "step": 2243 + }, + { + "epoch": 0.6811352253756261, + "grad_norm": 0.46512570977211, + "learning_rate": 8.869089804596537e-05, + "loss": 2.2241, + "step": 2244 + }, + { + "epoch": 0.6814387615723175, + "grad_norm": 0.387101411819458, + "learning_rate": 8.868583578009518e-05, + "loss": 1.7226, + "step": 2245 + }, + { + "epoch": 0.681742297769009, + "grad_norm": 0.40807807445526123, + "learning_rate": 8.868077351422497e-05, + "loss": 1.709, + "step": 2246 + }, + { + "epoch": 0.6820458339657004, + "grad_norm": 0.4039689600467682, + "learning_rate": 8.867571124835477e-05, + "loss": 1.6902, + "step": 2247 + }, + { + "epoch": 0.6823493701623918, + "grad_norm": 0.7101170420646667, + "learning_rate": 8.867064898248456e-05, + "loss": 1.6965, + "step": 2248 + }, + { + "epoch": 0.6826529063590833, + "grad_norm": 0.42346715927124023, + "learning_rate": 8.866558671661436e-05, + "loss": 1.4751, + "step": 2249 + }, + { + "epoch": 0.6829564425557748, + "grad_norm": 0.42237260937690735, + "learning_rate": 8.866052445074415e-05, + "loss": 1.8187, + "step": 2250 + }, + { + "epoch": 0.6832599787524662, + "grad_norm": 0.4752514958381653, + "learning_rate": 8.865546218487395e-05, + "loss": 1.7068, + "step": 2251 + }, + { + "epoch": 0.6835635149491577, + "grad_norm": 0.4356227219104767, + "learning_rate": 8.865039991900374e-05, + "loss": 2.0188, + "step": 2252 + }, + { + "epoch": 0.6838670511458491, + "grad_norm": 0.5964135527610779, + "learning_rate": 8.864533765313354e-05, + "loss": 1.5663, + "step": 2253 + }, + { + "epoch": 0.6841705873425405, + "grad_norm": 0.40307527780532837, + "learning_rate": 8.864027538726333e-05, + "loss": 1.8429, + "step": 2254 + }, + { + "epoch": 0.6844741235392321, + "grad_norm": 0.4318184554576874, + "learning_rate": 8.863521312139314e-05, + "loss": 1.7906, + "step": 2255 + }, + { + "epoch": 0.6847776597359235, + "grad_norm": 0.4366863965988159, + "learning_rate": 8.863015085552294e-05, + "loss": 1.9137, + "step": 2256 + }, + { + "epoch": 0.685081195932615, + "grad_norm": 0.4154497981071472, + "learning_rate": 8.862508858965274e-05, + "loss": 2.0196, + "step": 2257 + }, + { + "epoch": 0.6853847321293064, + "grad_norm": 0.39866191148757935, + "learning_rate": 8.862002632378254e-05, + "loss": 1.9397, + "step": 2258 + }, + { + "epoch": 0.6856882683259978, + "grad_norm": 0.42318910360336304, + "learning_rate": 8.861496405791233e-05, + "loss": 2.1399, + "step": 2259 + }, + { + "epoch": 0.6859918045226894, + "grad_norm": 0.4558073878288269, + "learning_rate": 8.860990179204213e-05, + "loss": 1.9519, + "step": 2260 + }, + { + "epoch": 0.6862953407193808, + "grad_norm": 0.45745977759361267, + "learning_rate": 8.860483952617192e-05, + "loss": 1.3459, + "step": 2261 + }, + { + "epoch": 0.6865988769160722, + "grad_norm": 0.4418570399284363, + "learning_rate": 8.859977726030172e-05, + "loss": 1.9207, + "step": 2262 + }, + { + "epoch": 0.6869024131127637, + "grad_norm": 0.3995778560638428, + "learning_rate": 8.859471499443151e-05, + "loss": 1.8226, + "step": 2263 + }, + { + "epoch": 0.6872059493094551, + "grad_norm": 0.5238348841667175, + "learning_rate": 8.858965272856131e-05, + "loss": 1.7849, + "step": 2264 + }, + { + "epoch": 0.6875094855061467, + "grad_norm": 0.3357613682746887, + "learning_rate": 8.85845904626911e-05, + "loss": 1.4962, + "step": 2265 + }, + { + "epoch": 0.6878130217028381, + "grad_norm": 0.45454543828964233, + "learning_rate": 8.857952819682091e-05, + "loss": 1.8111, + "step": 2266 + }, + { + "epoch": 0.6881165578995295, + "grad_norm": 0.4192381501197815, + "learning_rate": 8.85744659309507e-05, + "loss": 1.7505, + "step": 2267 + }, + { + "epoch": 0.688420094096221, + "grad_norm": 0.4213124215602875, + "learning_rate": 8.85694036650805e-05, + "loss": 1.9307, + "step": 2268 + }, + { + "epoch": 0.6887236302929124, + "grad_norm": 0.42022505402565, + "learning_rate": 8.85643413992103e-05, + "loss": 1.9889, + "step": 2269 + }, + { + "epoch": 0.6890271664896039, + "grad_norm": 0.42116105556488037, + "learning_rate": 8.855927913334009e-05, + "loss": 1.7128, + "step": 2270 + }, + { + "epoch": 0.6893307026862954, + "grad_norm": 1.0270413160324097, + "learning_rate": 8.855421686746988e-05, + "loss": 1.3238, + "step": 2271 + }, + { + "epoch": 0.6896342388829868, + "grad_norm": 0.42168179154396057, + "learning_rate": 8.854915460159968e-05, + "loss": 1.6624, + "step": 2272 + }, + { + "epoch": 0.6899377750796782, + "grad_norm": 0.3498344421386719, + "learning_rate": 8.854409233572947e-05, + "loss": 1.7382, + "step": 2273 + }, + { + "epoch": 0.6902413112763697, + "grad_norm": 0.42905229330062866, + "learning_rate": 8.853903006985927e-05, + "loss": 2.0058, + "step": 2274 + }, + { + "epoch": 0.6905448474730611, + "grad_norm": 0.41980302333831787, + "learning_rate": 8.853396780398908e-05, + "loss": 1.4661, + "step": 2275 + }, + { + "epoch": 0.6908483836697527, + "grad_norm": 0.5022958517074585, + "learning_rate": 8.852890553811887e-05, + "loss": 1.6155, + "step": 2276 + }, + { + "epoch": 0.6911519198664441, + "grad_norm": 0.4186939597129822, + "learning_rate": 8.852384327224867e-05, + "loss": 1.7008, + "step": 2277 + }, + { + "epoch": 0.6914554560631355, + "grad_norm": 0.39082199335098267, + "learning_rate": 8.851878100637846e-05, + "loss": 2.1537, + "step": 2278 + }, + { + "epoch": 0.691758992259827, + "grad_norm": 0.42378294467926025, + "learning_rate": 8.851371874050826e-05, + "loss": 2.0288, + "step": 2279 + }, + { + "epoch": 0.6920625284565184, + "grad_norm": 0.36108916997909546, + "learning_rate": 8.850865647463805e-05, + "loss": 2.0387, + "step": 2280 + }, + { + "epoch": 0.6923660646532099, + "grad_norm": 0.4613724648952484, + "learning_rate": 8.850359420876785e-05, + "loss": 1.7304, + "step": 2281 + }, + { + "epoch": 0.6926696008499014, + "grad_norm": 0.4140026867389679, + "learning_rate": 8.849853194289764e-05, + "loss": 1.9746, + "step": 2282 + }, + { + "epoch": 0.6929731370465928, + "grad_norm": 0.43233832716941833, + "learning_rate": 8.849346967702744e-05, + "loss": 1.9922, + "step": 2283 + }, + { + "epoch": 0.6932766732432842, + "grad_norm": 0.8021528124809265, + "learning_rate": 8.848840741115724e-05, + "loss": 2.1604, + "step": 2284 + }, + { + "epoch": 0.6935802094399757, + "grad_norm": 0.4009002447128296, + "learning_rate": 8.848334514528704e-05, + "loss": 1.4224, + "step": 2285 + }, + { + "epoch": 0.6938837456366672, + "grad_norm": 0.3914124369621277, + "learning_rate": 8.847828287941683e-05, + "loss": 1.8222, + "step": 2286 + }, + { + "epoch": 0.6941872818333586, + "grad_norm": 0.41309481859207153, + "learning_rate": 8.847322061354663e-05, + "loss": 1.9296, + "step": 2287 + }, + { + "epoch": 0.6944908180300501, + "grad_norm": 0.5561639666557312, + "learning_rate": 8.846815834767642e-05, + "loss": 1.3577, + "step": 2288 + }, + { + "epoch": 0.6947943542267415, + "grad_norm": 0.41699445247650146, + "learning_rate": 8.846309608180622e-05, + "loss": 1.8751, + "step": 2289 + }, + { + "epoch": 0.695097890423433, + "grad_norm": 0.3643866181373596, + "learning_rate": 8.845803381593601e-05, + "loss": 1.5099, + "step": 2290 + }, + { + "epoch": 0.6954014266201245, + "grad_norm": 0.44212576746940613, + "learning_rate": 8.845297155006581e-05, + "loss": 1.8293, + "step": 2291 + }, + { + "epoch": 0.6957049628168159, + "grad_norm": 0.36881545186042786, + "learning_rate": 8.84479092841956e-05, + "loss": 1.6359, + "step": 2292 + }, + { + "epoch": 0.6960084990135074, + "grad_norm": 0.3785519003868103, + "learning_rate": 8.84428470183254e-05, + "loss": 1.63, + "step": 2293 + }, + { + "epoch": 0.6963120352101988, + "grad_norm": 0.6767301559448242, + "learning_rate": 8.84377847524552e-05, + "loss": 1.0786, + "step": 2294 + }, + { + "epoch": 0.6966155714068902, + "grad_norm": 0.38619041442871094, + "learning_rate": 8.8432722486585e-05, + "loss": 1.4952, + "step": 2295 + }, + { + "epoch": 0.6969191076035818, + "grad_norm": 0.6698863506317139, + "learning_rate": 8.84276602207148e-05, + "loss": 2.0425, + "step": 2296 + }, + { + "epoch": 0.6972226438002732, + "grad_norm": 0.4446139931678772, + "learning_rate": 8.842259795484459e-05, + "loss": 1.7511, + "step": 2297 + }, + { + "epoch": 0.6975261799969646, + "grad_norm": 0.6287797093391418, + "learning_rate": 8.841753568897438e-05, + "loss": 1.8198, + "step": 2298 + }, + { + "epoch": 0.6978297161936561, + "grad_norm": 0.3704979717731476, + "learning_rate": 8.841247342310418e-05, + "loss": 1.44, + "step": 2299 + }, + { + "epoch": 0.6981332523903475, + "grad_norm": 0.4163188636302948, + "learning_rate": 8.840741115723397e-05, + "loss": 1.623, + "step": 2300 + }, + { + "epoch": 0.6984367885870391, + "grad_norm": 0.3959861993789673, + "learning_rate": 8.840234889136377e-05, + "loss": 1.9259, + "step": 2301 + }, + { + "epoch": 0.6987403247837305, + "grad_norm": 0.5066584348678589, + "learning_rate": 8.839728662549358e-05, + "loss": 1.8163, + "step": 2302 + }, + { + "epoch": 0.6990438609804219, + "grad_norm": 0.4553223252296448, + "learning_rate": 8.839222435962337e-05, + "loss": 1.4527, + "step": 2303 + }, + { + "epoch": 0.6993473971771134, + "grad_norm": 0.47616320848464966, + "learning_rate": 8.838716209375317e-05, + "loss": 1.8694, + "step": 2304 + }, + { + "epoch": 0.6996509333738048, + "grad_norm": 0.4735199511051178, + "learning_rate": 8.838209982788297e-05, + "loss": 1.7023, + "step": 2305 + }, + { + "epoch": 0.6999544695704962, + "grad_norm": 0.427415668964386, + "learning_rate": 8.837703756201277e-05, + "loss": 2.0255, + "step": 2306 + }, + { + "epoch": 0.7002580057671878, + "grad_norm": 0.35021111369132996, + "learning_rate": 8.837197529614256e-05, + "loss": 2.1431, + "step": 2307 + }, + { + "epoch": 0.7005615419638792, + "grad_norm": 0.35905367136001587, + "learning_rate": 8.836691303027236e-05, + "loss": 1.2157, + "step": 2308 + }, + { + "epoch": 0.7008650781605706, + "grad_norm": 0.39521825313568115, + "learning_rate": 8.836185076440215e-05, + "loss": 1.8051, + "step": 2309 + }, + { + "epoch": 0.7011686143572621, + "grad_norm": 0.42580053210258484, + "learning_rate": 8.835678849853195e-05, + "loss": 1.9845, + "step": 2310 + }, + { + "epoch": 0.7014721505539535, + "grad_norm": 0.6940016150474548, + "learning_rate": 8.835172623266174e-05, + "loss": 2.0824, + "step": 2311 + }, + { + "epoch": 0.7017756867506451, + "grad_norm": 0.44518351554870605, + "learning_rate": 8.834666396679154e-05, + "loss": 2.0919, + "step": 2312 + }, + { + "epoch": 0.7020792229473365, + "grad_norm": 0.4215528070926666, + "learning_rate": 8.834160170092133e-05, + "loss": 1.9665, + "step": 2313 + }, + { + "epoch": 0.7023827591440279, + "grad_norm": 0.4305053651332855, + "learning_rate": 8.833653943505114e-05, + "loss": 2.0338, + "step": 2314 + }, + { + "epoch": 0.7026862953407194, + "grad_norm": 0.7952874302864075, + "learning_rate": 8.833147716918094e-05, + "loss": 1.6609, + "step": 2315 + }, + { + "epoch": 0.7029898315374108, + "grad_norm": 0.42054691910743713, + "learning_rate": 8.832641490331073e-05, + "loss": 1.3244, + "step": 2316 + }, + { + "epoch": 0.7032933677341023, + "grad_norm": 0.3898642659187317, + "learning_rate": 8.832135263744053e-05, + "loss": 1.5618, + "step": 2317 + }, + { + "epoch": 0.7035969039307938, + "grad_norm": 0.4959927201271057, + "learning_rate": 8.831629037157032e-05, + "loss": 1.999, + "step": 2318 + }, + { + "epoch": 0.7039004401274852, + "grad_norm": 0.4488220810890198, + "learning_rate": 8.831122810570012e-05, + "loss": 1.946, + "step": 2319 + }, + { + "epoch": 0.7042039763241766, + "grad_norm": 0.4661828577518463, + "learning_rate": 8.830616583982991e-05, + "loss": 1.9832, + "step": 2320 + }, + { + "epoch": 0.7045075125208681, + "grad_norm": 0.3740446865558624, + "learning_rate": 8.83011035739597e-05, + "loss": 1.8934, + "step": 2321 + }, + { + "epoch": 0.7048110487175596, + "grad_norm": 0.4690150022506714, + "learning_rate": 8.82960413080895e-05, + "loss": 1.9083, + "step": 2322 + }, + { + "epoch": 0.705114584914251, + "grad_norm": 0.3680610954761505, + "learning_rate": 8.829097904221931e-05, + "loss": 1.9494, + "step": 2323 + }, + { + "epoch": 0.7054181211109425, + "grad_norm": 0.40403270721435547, + "learning_rate": 8.82859167763491e-05, + "loss": 2.1369, + "step": 2324 + }, + { + "epoch": 0.7057216573076339, + "grad_norm": 0.4465244710445404, + "learning_rate": 8.82808545104789e-05, + "loss": 1.8854, + "step": 2325 + }, + { + "epoch": 0.7060251935043254, + "grad_norm": 0.45881539583206177, + "learning_rate": 8.827579224460869e-05, + "loss": 1.6319, + "step": 2326 + }, + { + "epoch": 0.7063287297010169, + "grad_norm": 0.43863871693611145, + "learning_rate": 8.827072997873849e-05, + "loss": 1.8284, + "step": 2327 + }, + { + "epoch": 0.7066322658977083, + "grad_norm": 0.3942803740501404, + "learning_rate": 8.826566771286828e-05, + "loss": 1.8663, + "step": 2328 + }, + { + "epoch": 0.7069358020943998, + "grad_norm": 0.42162778973579407, + "learning_rate": 8.826060544699808e-05, + "loss": 1.8563, + "step": 2329 + }, + { + "epoch": 0.7072393382910912, + "grad_norm": 0.4088474214076996, + "learning_rate": 8.825554318112787e-05, + "loss": 2.2026, + "step": 2330 + }, + { + "epoch": 0.7075428744877826, + "grad_norm": 0.43421268463134766, + "learning_rate": 8.825048091525767e-05, + "loss": 2.2395, + "step": 2331 + }, + { + "epoch": 0.7078464106844741, + "grad_norm": 0.4430371820926666, + "learning_rate": 8.824541864938746e-05, + "loss": 1.7794, + "step": 2332 + }, + { + "epoch": 0.7081499468811656, + "grad_norm": 0.41605162620544434, + "learning_rate": 8.824035638351727e-05, + "loss": 2.1565, + "step": 2333 + }, + { + "epoch": 0.708453483077857, + "grad_norm": 0.3622266352176666, + "learning_rate": 8.823529411764706e-05, + "loss": 1.4366, + "step": 2334 + }, + { + "epoch": 0.7087570192745485, + "grad_norm": 0.4030252695083618, + "learning_rate": 8.823023185177686e-05, + "loss": 1.7127, + "step": 2335 + }, + { + "epoch": 0.7090605554712399, + "grad_norm": 0.4723038375377655, + "learning_rate": 8.822516958590665e-05, + "loss": 1.6736, + "step": 2336 + }, + { + "epoch": 0.7093640916679314, + "grad_norm": 0.39395782351493835, + "learning_rate": 8.822010732003645e-05, + "loss": 1.8631, + "step": 2337 + }, + { + "epoch": 0.7096676278646229, + "grad_norm": 0.4566243290901184, + "learning_rate": 8.821504505416624e-05, + "loss": 1.8543, + "step": 2338 + }, + { + "epoch": 0.7099711640613143, + "grad_norm": 0.3434160351753235, + "learning_rate": 8.820998278829604e-05, + "loss": 1.9953, + "step": 2339 + }, + { + "epoch": 0.7102747002580058, + "grad_norm": 0.5802703499794006, + "learning_rate": 8.820492052242583e-05, + "loss": 1.7014, + "step": 2340 + }, + { + "epoch": 0.7105782364546972, + "grad_norm": 1.1318562030792236, + "learning_rate": 8.819985825655563e-05, + "loss": 1.9866, + "step": 2341 + }, + { + "epoch": 0.7108817726513886, + "grad_norm": 0.8500426411628723, + "learning_rate": 8.819479599068544e-05, + "loss": 1.2291, + "step": 2342 + }, + { + "epoch": 0.7111853088480802, + "grad_norm": 0.4189402163028717, + "learning_rate": 8.818973372481523e-05, + "loss": 1.9396, + "step": 2343 + }, + { + "epoch": 0.7114888450447716, + "grad_norm": 0.7509348392486572, + "learning_rate": 8.818467145894503e-05, + "loss": 2.168, + "step": 2344 + }, + { + "epoch": 0.711792381241463, + "grad_norm": 0.42071589827537537, + "learning_rate": 8.817960919307482e-05, + "loss": 1.9493, + "step": 2345 + }, + { + "epoch": 0.7120959174381545, + "grad_norm": 0.7269922494888306, + "learning_rate": 8.817454692720463e-05, + "loss": 1.6344, + "step": 2346 + }, + { + "epoch": 0.7123994536348459, + "grad_norm": 0.5446398854255676, + "learning_rate": 8.816948466133442e-05, + "loss": 1.9525, + "step": 2347 + }, + { + "epoch": 0.7127029898315375, + "grad_norm": 0.43752509355545044, + "learning_rate": 8.816442239546422e-05, + "loss": 1.9692, + "step": 2348 + }, + { + "epoch": 0.7130065260282289, + "grad_norm": 0.4986307919025421, + "learning_rate": 8.815936012959401e-05, + "loss": 1.805, + "step": 2349 + }, + { + "epoch": 0.7133100622249203, + "grad_norm": 0.47302186489105225, + "learning_rate": 8.815429786372381e-05, + "loss": 2.2322, + "step": 2350 + }, + { + "epoch": 0.7136135984216118, + "grad_norm": 0.4359181523323059, + "learning_rate": 8.81492355978536e-05, + "loss": 1.7579, + "step": 2351 + }, + { + "epoch": 0.7139171346183032, + "grad_norm": 0.9149986505508423, + "learning_rate": 8.81441733319834e-05, + "loss": 1.4888, + "step": 2352 + }, + { + "epoch": 0.7142206708149947, + "grad_norm": 0.37777209281921387, + "learning_rate": 8.81391110661132e-05, + "loss": 2.0646, + "step": 2353 + }, + { + "epoch": 0.7145242070116862, + "grad_norm": 0.527703046798706, + "learning_rate": 8.8134048800243e-05, + "loss": 1.9213, + "step": 2354 + }, + { + "epoch": 0.7148277432083776, + "grad_norm": 0.41505396366119385, + "learning_rate": 8.81289865343728e-05, + "loss": 1.7469, + "step": 2355 + }, + { + "epoch": 0.715131279405069, + "grad_norm": 0.44212964177131653, + "learning_rate": 8.812392426850259e-05, + "loss": 1.2739, + "step": 2356 + }, + { + "epoch": 0.7154348156017605, + "grad_norm": 0.3863414525985718, + "learning_rate": 8.811886200263239e-05, + "loss": 2.2481, + "step": 2357 + }, + { + "epoch": 0.7157383517984519, + "grad_norm": 0.45853668451309204, + "learning_rate": 8.811379973676218e-05, + "loss": 2.2813, + "step": 2358 + }, + { + "epoch": 0.7160418879951435, + "grad_norm": 11.417152404785156, + "learning_rate": 8.810873747089198e-05, + "loss": 2.0079, + "step": 2359 + }, + { + "epoch": 0.7163454241918349, + "grad_norm": 0.49986690282821655, + "learning_rate": 8.810367520502177e-05, + "loss": 1.5595, + "step": 2360 + }, + { + "epoch": 0.7166489603885263, + "grad_norm": 0.4734189510345459, + "learning_rate": 8.809861293915156e-05, + "loss": 1.8213, + "step": 2361 + }, + { + "epoch": 0.7169524965852178, + "grad_norm": 0.43908554315567017, + "learning_rate": 8.809355067328137e-05, + "loss": 1.889, + "step": 2362 + }, + { + "epoch": 0.7172560327819092, + "grad_norm": 0.48986315727233887, + "learning_rate": 8.808848840741117e-05, + "loss": 1.9236, + "step": 2363 + }, + { + "epoch": 0.7175595689786007, + "grad_norm": 0.42691266536712646, + "learning_rate": 8.808342614154096e-05, + "loss": 2.0663, + "step": 2364 + }, + { + "epoch": 0.7178631051752922, + "grad_norm": 0.38523420691490173, + "learning_rate": 8.807836387567076e-05, + "loss": 1.9165, + "step": 2365 + }, + { + "epoch": 0.7181666413719836, + "grad_norm": 0.29487428069114685, + "learning_rate": 8.807330160980055e-05, + "loss": 1.4763, + "step": 2366 + }, + { + "epoch": 0.718470177568675, + "grad_norm": 0.9072676301002502, + "learning_rate": 8.806823934393035e-05, + "loss": 2.0882, + "step": 2367 + }, + { + "epoch": 0.7187737137653665, + "grad_norm": 0.37307825684547424, + "learning_rate": 8.806317707806014e-05, + "loss": 1.4694, + "step": 2368 + }, + { + "epoch": 0.719077249962058, + "grad_norm": 0.41390106081962585, + "learning_rate": 8.805811481218994e-05, + "loss": 1.6849, + "step": 2369 + }, + { + "epoch": 0.7193807861587495, + "grad_norm": 0.4989478886127472, + "learning_rate": 8.805305254631973e-05, + "loss": 2.146, + "step": 2370 + }, + { + "epoch": 0.7196843223554409, + "grad_norm": 0.4021719694137573, + "learning_rate": 8.804799028044953e-05, + "loss": 1.942, + "step": 2371 + }, + { + "epoch": 0.7199878585521323, + "grad_norm": 0.4169461727142334, + "learning_rate": 8.804292801457933e-05, + "loss": 2.0278, + "step": 2372 + }, + { + "epoch": 0.7202913947488238, + "grad_norm": 0.39091089367866516, + "learning_rate": 8.803786574870913e-05, + "loss": 1.9644, + "step": 2373 + }, + { + "epoch": 0.7205949309455153, + "grad_norm": 0.45431414246559143, + "learning_rate": 8.803280348283892e-05, + "loss": 1.5611, + "step": 2374 + }, + { + "epoch": 0.7208984671422067, + "grad_norm": 0.3896774351596832, + "learning_rate": 8.802774121696872e-05, + "loss": 1.7838, + "step": 2375 + }, + { + "epoch": 0.7212020033388982, + "grad_norm": 0.4076644778251648, + "learning_rate": 8.802267895109851e-05, + "loss": 1.9717, + "step": 2376 + }, + { + "epoch": 0.7215055395355896, + "grad_norm": 0.4065254032611847, + "learning_rate": 8.801761668522831e-05, + "loss": 1.6598, + "step": 2377 + }, + { + "epoch": 0.721809075732281, + "grad_norm": 0.6506657004356384, + "learning_rate": 8.80125544193581e-05, + "loss": 1.9463, + "step": 2378 + }, + { + "epoch": 0.7221126119289726, + "grad_norm": 0.46132421493530273, + "learning_rate": 8.80074921534879e-05, + "loss": 2.0071, + "step": 2379 + }, + { + "epoch": 0.722416148125664, + "grad_norm": 0.3932840824127197, + "learning_rate": 8.800242988761769e-05, + "loss": 1.9956, + "step": 2380 + }, + { + "epoch": 0.7227196843223554, + "grad_norm": 0.4919872581958771, + "learning_rate": 8.79973676217475e-05, + "loss": 1.658, + "step": 2381 + }, + { + "epoch": 0.7230232205190469, + "grad_norm": 0.4147129952907562, + "learning_rate": 8.79923053558773e-05, + "loss": 1.9331, + "step": 2382 + }, + { + "epoch": 0.7233267567157383, + "grad_norm": 0.4280264973640442, + "learning_rate": 8.798724309000709e-05, + "loss": 1.7016, + "step": 2383 + }, + { + "epoch": 0.7236302929124299, + "grad_norm": 0.4554193913936615, + "learning_rate": 8.798218082413689e-05, + "loss": 2.08, + "step": 2384 + }, + { + "epoch": 0.7239338291091213, + "grad_norm": 0.4477219581604004, + "learning_rate": 8.797711855826668e-05, + "loss": 1.6204, + "step": 2385 + }, + { + "epoch": 0.7242373653058127, + "grad_norm": 0.32487139105796814, + "learning_rate": 8.797205629239648e-05, + "loss": 1.7271, + "step": 2386 + }, + { + "epoch": 0.7245409015025042, + "grad_norm": 2.3734400272369385, + "learning_rate": 8.796699402652627e-05, + "loss": 2.3315, + "step": 2387 + }, + { + "epoch": 0.7248444376991956, + "grad_norm": 0.3860095739364624, + "learning_rate": 8.796193176065606e-05, + "loss": 1.4387, + "step": 2388 + }, + { + "epoch": 0.725147973895887, + "grad_norm": 0.3950817286968231, + "learning_rate": 8.795686949478586e-05, + "loss": 2.1907, + "step": 2389 + }, + { + "epoch": 0.7254515100925786, + "grad_norm": 0.37350189685821533, + "learning_rate": 8.795180722891567e-05, + "loss": 1.5662, + "step": 2390 + }, + { + "epoch": 0.72575504628927, + "grad_norm": 0.46780696511268616, + "learning_rate": 8.794674496304546e-05, + "loss": 1.4264, + "step": 2391 + }, + { + "epoch": 0.7260585824859614, + "grad_norm": 0.3911786377429962, + "learning_rate": 8.794168269717527e-05, + "loss": 1.8752, + "step": 2392 + }, + { + "epoch": 0.7263621186826529, + "grad_norm": 0.5619503855705261, + "learning_rate": 8.793662043130507e-05, + "loss": 2.0974, + "step": 2393 + }, + { + "epoch": 0.7266656548793443, + "grad_norm": 0.44586917757987976, + "learning_rate": 8.793155816543486e-05, + "loss": 2.0871, + "step": 2394 + }, + { + "epoch": 0.7269691910760359, + "grad_norm": 0.3445717990398407, + "learning_rate": 8.792649589956466e-05, + "loss": 1.3626, + "step": 2395 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 0.410279244184494, + "learning_rate": 8.792143363369445e-05, + "loss": 1.5149, + "step": 2396 + }, + { + "epoch": 0.7275762634694187, + "grad_norm": 0.3766261339187622, + "learning_rate": 8.791637136782425e-05, + "loss": 1.8248, + "step": 2397 + }, + { + "epoch": 0.7278797996661102, + "grad_norm": 0.3978814482688904, + "learning_rate": 8.791130910195404e-05, + "loss": 1.9437, + "step": 2398 + }, + { + "epoch": 0.7281833358628016, + "grad_norm": 0.36028674244880676, + "learning_rate": 8.790624683608383e-05, + "loss": 1.8169, + "step": 2399 + }, + { + "epoch": 0.7284868720594931, + "grad_norm": 0.38613566756248474, + "learning_rate": 8.790118457021363e-05, + "loss": 1.3976, + "step": 2400 + }, + { + "epoch": 0.7287904082561846, + "grad_norm": 0.4028817117214203, + "learning_rate": 8.789612230434344e-05, + "loss": 1.5574, + "step": 2401 + }, + { + "epoch": 0.729093944452876, + "grad_norm": 0.4536430239677429, + "learning_rate": 8.789106003847323e-05, + "loss": 1.7994, + "step": 2402 + }, + { + "epoch": 0.7293974806495674, + "grad_norm": 0.421176016330719, + "learning_rate": 8.788599777260303e-05, + "loss": 2.1671, + "step": 2403 + }, + { + "epoch": 0.7297010168462589, + "grad_norm": 0.42854103446006775, + "learning_rate": 8.788093550673282e-05, + "loss": 1.5606, + "step": 2404 + }, + { + "epoch": 0.7300045530429504, + "grad_norm": 0.38108232617378235, + "learning_rate": 8.787587324086262e-05, + "loss": 1.8415, + "step": 2405 + }, + { + "epoch": 0.7303080892396419, + "grad_norm": 0.454464852809906, + "learning_rate": 8.787081097499241e-05, + "loss": 1.5911, + "step": 2406 + }, + { + "epoch": 0.7306116254363333, + "grad_norm": 0.4082881808280945, + "learning_rate": 8.78657487091222e-05, + "loss": 2.022, + "step": 2407 + }, + { + "epoch": 0.7309151616330247, + "grad_norm": 0.4951760470867157, + "learning_rate": 8.7860686443252e-05, + "loss": 2.12, + "step": 2408 + }, + { + "epoch": 0.7312186978297162, + "grad_norm": 0.40377724170684814, + "learning_rate": 8.78556241773818e-05, + "loss": 2.0016, + "step": 2409 + }, + { + "epoch": 0.7315222340264077, + "grad_norm": 0.403481662273407, + "learning_rate": 8.785056191151159e-05, + "loss": 1.9914, + "step": 2410 + }, + { + "epoch": 0.7318257702230991, + "grad_norm": 0.4195014536380768, + "learning_rate": 8.78454996456414e-05, + "loss": 1.8749, + "step": 2411 + }, + { + "epoch": 0.7321293064197906, + "grad_norm": 0.40575090050697327, + "learning_rate": 8.78404373797712e-05, + "loss": 1.8565, + "step": 2412 + }, + { + "epoch": 0.732432842616482, + "grad_norm": 0.4025145173072815, + "learning_rate": 8.783537511390099e-05, + "loss": 1.7397, + "step": 2413 + }, + { + "epoch": 0.7327363788131734, + "grad_norm": 0.35525646805763245, + "learning_rate": 8.783031284803078e-05, + "loss": 2.086, + "step": 2414 + }, + { + "epoch": 0.7330399150098649, + "grad_norm": 0.4063604772090912, + "learning_rate": 8.782525058216058e-05, + "loss": 1.8643, + "step": 2415 + }, + { + "epoch": 0.7333434512065564, + "grad_norm": 0.3689418137073517, + "learning_rate": 8.782018831629037e-05, + "loss": 1.9982, + "step": 2416 + }, + { + "epoch": 0.7336469874032479, + "grad_norm": 0.4066859185695648, + "learning_rate": 8.781512605042017e-05, + "loss": 1.4861, + "step": 2417 + }, + { + "epoch": 0.7339505235999393, + "grad_norm": 0.4118275046348572, + "learning_rate": 8.781006378454996e-05, + "loss": 1.923, + "step": 2418 + }, + { + "epoch": 0.7342540597966307, + "grad_norm": 0.4238114058971405, + "learning_rate": 8.780500151867976e-05, + "loss": 1.7656, + "step": 2419 + }, + { + "epoch": 0.7345575959933222, + "grad_norm": 0.4456924498081207, + "learning_rate": 8.779993925280957e-05, + "loss": 1.3612, + "step": 2420 + }, + { + "epoch": 0.7348611321900137, + "grad_norm": 0.3890600800514221, + "learning_rate": 8.779487698693936e-05, + "loss": 1.7588, + "step": 2421 + }, + { + "epoch": 0.7351646683867051, + "grad_norm": 0.35149431228637695, + "learning_rate": 8.778981472106916e-05, + "loss": 1.9639, + "step": 2422 + }, + { + "epoch": 0.7354682045833966, + "grad_norm": 0.43158209323883057, + "learning_rate": 8.778475245519895e-05, + "loss": 2.008, + "step": 2423 + }, + { + "epoch": 0.735771740780088, + "grad_norm": 0.4337812066078186, + "learning_rate": 8.777969018932875e-05, + "loss": 1.9703, + "step": 2424 + }, + { + "epoch": 0.7360752769767794, + "grad_norm": 0.36363622546195984, + "learning_rate": 8.777462792345854e-05, + "loss": 1.712, + "step": 2425 + }, + { + "epoch": 0.736378813173471, + "grad_norm": 0.3731040358543396, + "learning_rate": 8.776956565758833e-05, + "loss": 2.11, + "step": 2426 + }, + { + "epoch": 0.7366823493701624, + "grad_norm": 0.37047722935676575, + "learning_rate": 8.776450339171813e-05, + "loss": 2.1481, + "step": 2427 + }, + { + "epoch": 0.7369858855668538, + "grad_norm": 1.4641270637512207, + "learning_rate": 8.775944112584792e-05, + "loss": 1.9506, + "step": 2428 + }, + { + "epoch": 0.7372894217635453, + "grad_norm": 0.3781220018863678, + "learning_rate": 8.775437885997773e-05, + "loss": 1.9651, + "step": 2429 + }, + { + "epoch": 0.7375929579602367, + "grad_norm": 0.353572815656662, + "learning_rate": 8.774931659410753e-05, + "loss": 1.8171, + "step": 2430 + }, + { + "epoch": 0.7378964941569283, + "grad_norm": 0.4101322889328003, + "learning_rate": 8.774425432823732e-05, + "loss": 1.1176, + "step": 2431 + }, + { + "epoch": 0.7382000303536197, + "grad_norm": 0.3967879116535187, + "learning_rate": 8.773919206236712e-05, + "loss": 1.9563, + "step": 2432 + }, + { + "epoch": 0.7385035665503111, + "grad_norm": 0.41269639134407043, + "learning_rate": 8.773412979649691e-05, + "loss": 1.8312, + "step": 2433 + }, + { + "epoch": 0.7388071027470026, + "grad_norm": 0.4125451445579529, + "learning_rate": 8.77290675306267e-05, + "loss": 1.9907, + "step": 2434 + }, + { + "epoch": 0.739110638943694, + "grad_norm": 0.447773814201355, + "learning_rate": 8.772400526475652e-05, + "loss": 2.0819, + "step": 2435 + }, + { + "epoch": 0.7394141751403855, + "grad_norm": 0.3990137279033661, + "learning_rate": 8.771894299888631e-05, + "loss": 1.5107, + "step": 2436 + }, + { + "epoch": 0.739717711337077, + "grad_norm": 0.56345534324646, + "learning_rate": 8.77138807330161e-05, + "loss": 1.6428, + "step": 2437 + }, + { + "epoch": 0.7400212475337684, + "grad_norm": 0.42566823959350586, + "learning_rate": 8.77088184671459e-05, + "loss": 1.5812, + "step": 2438 + }, + { + "epoch": 0.7403247837304598, + "grad_norm": 0.4182227551937103, + "learning_rate": 8.77037562012757e-05, + "loss": 1.8838, + "step": 2439 + }, + { + "epoch": 0.7406283199271513, + "grad_norm": 0.3614279627799988, + "learning_rate": 8.76986939354055e-05, + "loss": 1.8822, + "step": 2440 + }, + { + "epoch": 0.7409318561238427, + "grad_norm": 0.4376552104949951, + "learning_rate": 8.76936316695353e-05, + "loss": 2.0327, + "step": 2441 + }, + { + "epoch": 0.7412353923205343, + "grad_norm": 0.3294520676136017, + "learning_rate": 8.768856940366509e-05, + "loss": 1.7045, + "step": 2442 + }, + { + "epoch": 0.7415389285172257, + "grad_norm": 0.39772239327430725, + "learning_rate": 8.768350713779489e-05, + "loss": 1.7068, + "step": 2443 + }, + { + "epoch": 0.7418424647139171, + "grad_norm": 0.4332139194011688, + "learning_rate": 8.767844487192468e-05, + "loss": 1.5432, + "step": 2444 + }, + { + "epoch": 0.7421460009106086, + "grad_norm": 0.40865209698677063, + "learning_rate": 8.767338260605448e-05, + "loss": 2.0378, + "step": 2445 + }, + { + "epoch": 0.7424495371073, + "grad_norm": 0.3608027696609497, + "learning_rate": 8.766832034018427e-05, + "loss": 1.4315, + "step": 2446 + }, + { + "epoch": 0.7427530733039915, + "grad_norm": 0.38700732588768005, + "learning_rate": 8.766325807431407e-05, + "loss": 1.1194, + "step": 2447 + }, + { + "epoch": 0.743056609500683, + "grad_norm": 0.4182412624359131, + "learning_rate": 8.765819580844386e-05, + "loss": 2.0976, + "step": 2448 + }, + { + "epoch": 0.7433601456973744, + "grad_norm": 0.40817487239837646, + "learning_rate": 8.765313354257366e-05, + "loss": 1.9548, + "step": 2449 + }, + { + "epoch": 0.7436636818940658, + "grad_norm": 0.4414690434932709, + "learning_rate": 8.764807127670346e-05, + "loss": 1.8551, + "step": 2450 + }, + { + "epoch": 0.7439672180907573, + "grad_norm": 0.393435001373291, + "learning_rate": 8.764300901083326e-05, + "loss": 1.7419, + "step": 2451 + }, + { + "epoch": 0.7442707542874488, + "grad_norm": 0.36712646484375, + "learning_rate": 8.763794674496305e-05, + "loss": 1.9663, + "step": 2452 + }, + { + "epoch": 0.7445742904841403, + "grad_norm": 0.47254228591918945, + "learning_rate": 8.763288447909285e-05, + "loss": 1.6821, + "step": 2453 + }, + { + "epoch": 0.7448778266808317, + "grad_norm": 0.6918731927871704, + "learning_rate": 8.762782221322264e-05, + "loss": 1.4488, + "step": 2454 + }, + { + "epoch": 0.7451813628775231, + "grad_norm": 0.4374895393848419, + "learning_rate": 8.762275994735244e-05, + "loss": 1.8773, + "step": 2455 + }, + { + "epoch": 0.7454848990742146, + "grad_norm": 0.3807445466518402, + "learning_rate": 8.761769768148223e-05, + "loss": 1.9019, + "step": 2456 + }, + { + "epoch": 0.7457884352709061, + "grad_norm": 0.804283857345581, + "learning_rate": 8.761263541561203e-05, + "loss": 1.6503, + "step": 2457 + }, + { + "epoch": 0.7460919714675975, + "grad_norm": 0.4001246690750122, + "learning_rate": 8.760757314974182e-05, + "loss": 1.8858, + "step": 2458 + }, + { + "epoch": 0.746395507664289, + "grad_norm": 0.4548395574092865, + "learning_rate": 8.760251088387163e-05, + "loss": 1.4205, + "step": 2459 + }, + { + "epoch": 0.7466990438609804, + "grad_norm": 0.4249577522277832, + "learning_rate": 8.759744861800143e-05, + "loss": 2.0217, + "step": 2460 + }, + { + "epoch": 0.7470025800576718, + "grad_norm": 0.356995165348053, + "learning_rate": 8.759238635213122e-05, + "loss": 2.1138, + "step": 2461 + }, + { + "epoch": 0.7473061162543634, + "grad_norm": 0.39245614409446716, + "learning_rate": 8.758732408626102e-05, + "loss": 1.9274, + "step": 2462 + }, + { + "epoch": 0.7476096524510548, + "grad_norm": 0.5045961737632751, + "learning_rate": 8.758226182039081e-05, + "loss": 1.5911, + "step": 2463 + }, + { + "epoch": 0.7479131886477463, + "grad_norm": 0.4416704773902893, + "learning_rate": 8.75771995545206e-05, + "loss": 1.7762, + "step": 2464 + }, + { + "epoch": 0.7482167248444377, + "grad_norm": 0.6794231534004211, + "learning_rate": 8.75721372886504e-05, + "loss": 2.1296, + "step": 2465 + }, + { + "epoch": 0.7485202610411291, + "grad_norm": 0.4514855444431305, + "learning_rate": 8.75670750227802e-05, + "loss": 1.9012, + "step": 2466 + }, + { + "epoch": 0.7488237972378207, + "grad_norm": 0.3483482003211975, + "learning_rate": 8.756201275690999e-05, + "loss": 1.3373, + "step": 2467 + }, + { + "epoch": 0.7491273334345121, + "grad_norm": 0.4310845136642456, + "learning_rate": 8.75569504910398e-05, + "loss": 1.7987, + "step": 2468 + }, + { + "epoch": 0.7494308696312035, + "grad_norm": 0.39404624700546265, + "learning_rate": 8.755188822516959e-05, + "loss": 1.764, + "step": 2469 + }, + { + "epoch": 0.749734405827895, + "grad_norm": 0.39560645818710327, + "learning_rate": 8.754682595929939e-05, + "loss": 1.7229, + "step": 2470 + }, + { + "epoch": 0.7500379420245864, + "grad_norm": 0.4125354588031769, + "learning_rate": 8.754176369342918e-05, + "loss": 1.6562, + "step": 2471 + }, + { + "epoch": 0.7503414782212778, + "grad_norm": 0.4781520366668701, + "learning_rate": 8.753670142755898e-05, + "loss": 2.0631, + "step": 2472 + }, + { + "epoch": 0.7506450144179694, + "grad_norm": 0.4587598443031311, + "learning_rate": 8.753163916168877e-05, + "loss": 1.7579, + "step": 2473 + }, + { + "epoch": 0.7509485506146608, + "grad_norm": 0.44834762811660767, + "learning_rate": 8.752657689581857e-05, + "loss": 1.8958, + "step": 2474 + }, + { + "epoch": 0.7512520868113522, + "grad_norm": 0.3947749137878418, + "learning_rate": 8.752151462994836e-05, + "loss": 2.041, + "step": 2475 + }, + { + "epoch": 0.7515556230080437, + "grad_norm": 0.38898783922195435, + "learning_rate": 8.751645236407816e-05, + "loss": 1.8326, + "step": 2476 + }, + { + "epoch": 0.7518591592047351, + "grad_norm": 0.4535033404827118, + "learning_rate": 8.751139009820795e-05, + "loss": 1.7797, + "step": 2477 + }, + { + "epoch": 0.7521626954014267, + "grad_norm": 0.34348368644714355, + "learning_rate": 8.750632783233776e-05, + "loss": 1.8936, + "step": 2478 + }, + { + "epoch": 0.7524662315981181, + "grad_norm": 0.36187052726745605, + "learning_rate": 8.750126556646755e-05, + "loss": 1.4416, + "step": 2479 + }, + { + "epoch": 0.7527697677948095, + "grad_norm": 0.4151141941547394, + "learning_rate": 8.749620330059736e-05, + "loss": 2.0148, + "step": 2480 + }, + { + "epoch": 0.753073303991501, + "grad_norm": 0.39229243993759155, + "learning_rate": 8.749114103472716e-05, + "loss": 1.622, + "step": 2481 + }, + { + "epoch": 0.7533768401881924, + "grad_norm": 0.44165119528770447, + "learning_rate": 8.748607876885695e-05, + "loss": 2.2657, + "step": 2482 + }, + { + "epoch": 0.7536803763848839, + "grad_norm": 0.5234296917915344, + "learning_rate": 8.748101650298675e-05, + "loss": 1.6954, + "step": 2483 + }, + { + "epoch": 0.7539839125815754, + "grad_norm": 0.4218185842037201, + "learning_rate": 8.747595423711654e-05, + "loss": 2.028, + "step": 2484 + }, + { + "epoch": 0.7542874487782668, + "grad_norm": 0.6535462737083435, + "learning_rate": 8.747089197124634e-05, + "loss": 1.9042, + "step": 2485 + }, + { + "epoch": 0.7545909849749582, + "grad_norm": 0.34253132343292236, + "learning_rate": 8.746582970537613e-05, + "loss": 1.5497, + "step": 2486 + }, + { + "epoch": 0.7548945211716497, + "grad_norm": 0.4396836757659912, + "learning_rate": 8.746076743950593e-05, + "loss": 1.912, + "step": 2487 + }, + { + "epoch": 0.7551980573683412, + "grad_norm": 0.44126465916633606, + "learning_rate": 8.745570517363572e-05, + "loss": 1.7907, + "step": 2488 + }, + { + "epoch": 0.7555015935650327, + "grad_norm": 0.42292916774749756, + "learning_rate": 8.745064290776553e-05, + "loss": 1.9956, + "step": 2489 + }, + { + "epoch": 0.7558051297617241, + "grad_norm": 0.4493507146835327, + "learning_rate": 8.744558064189532e-05, + "loss": 1.9819, + "step": 2490 + }, + { + "epoch": 0.7561086659584155, + "grad_norm": 0.5793929100036621, + "learning_rate": 8.744051837602512e-05, + "loss": 2.1241, + "step": 2491 + }, + { + "epoch": 0.756412202155107, + "grad_norm": 0.3927520513534546, + "learning_rate": 8.743545611015491e-05, + "loss": 1.6615, + "step": 2492 + }, + { + "epoch": 0.7567157383517985, + "grad_norm": 0.4623410403728485, + "learning_rate": 8.743039384428471e-05, + "loss": 1.9174, + "step": 2493 + }, + { + "epoch": 0.7570192745484899, + "grad_norm": 0.4135148823261261, + "learning_rate": 8.74253315784145e-05, + "loss": 1.9403, + "step": 2494 + }, + { + "epoch": 0.7573228107451814, + "grad_norm": 0.4701920449733734, + "learning_rate": 8.74202693125443e-05, + "loss": 1.9479, + "step": 2495 + }, + { + "epoch": 0.7576263469418728, + "grad_norm": 0.42168691754341125, + "learning_rate": 8.741520704667409e-05, + "loss": 1.6007, + "step": 2496 + }, + { + "epoch": 0.7579298831385642, + "grad_norm": 1.0131754875183105, + "learning_rate": 8.741014478080389e-05, + "loss": 1.8092, + "step": 2497 + }, + { + "epoch": 0.7582334193352557, + "grad_norm": 0.39457446336746216, + "learning_rate": 8.74050825149337e-05, + "loss": 2.0884, + "step": 2498 + }, + { + "epoch": 0.7585369555319472, + "grad_norm": 0.9458907842636108, + "learning_rate": 8.740002024906349e-05, + "loss": 1.5426, + "step": 2499 + }, + { + "epoch": 0.7588404917286387, + "grad_norm": 0.6375271677970886, + "learning_rate": 8.739495798319329e-05, + "loss": 1.9517, + "step": 2500 + }, + { + "epoch": 0.7591440279253301, + "grad_norm": 0.4703015089035034, + "learning_rate": 8.738989571732308e-05, + "loss": 1.8815, + "step": 2501 + }, + { + "epoch": 0.7594475641220215, + "grad_norm": 0.4868961572647095, + "learning_rate": 8.738483345145287e-05, + "loss": 2.1519, + "step": 2502 + }, + { + "epoch": 0.759751100318713, + "grad_norm": 0.3808225691318512, + "learning_rate": 8.737977118558267e-05, + "loss": 1.9137, + "step": 2503 + }, + { + "epoch": 0.7600546365154045, + "grad_norm": 0.4780614674091339, + "learning_rate": 8.737470891971246e-05, + "loss": 2.0755, + "step": 2504 + }, + { + "epoch": 0.7603581727120959, + "grad_norm": 0.8534510135650635, + "learning_rate": 8.736964665384226e-05, + "loss": 2.2839, + "step": 2505 + }, + { + "epoch": 0.7606617089087874, + "grad_norm": 0.3620678782463074, + "learning_rate": 8.736458438797205e-05, + "loss": 1.9497, + "step": 2506 + }, + { + "epoch": 0.7609652451054788, + "grad_norm": 0.40993764996528625, + "learning_rate": 8.735952212210186e-05, + "loss": 1.7032, + "step": 2507 + }, + { + "epoch": 0.7612687813021702, + "grad_norm": 0.43798285722732544, + "learning_rate": 8.735445985623166e-05, + "loss": 1.8932, + "step": 2508 + }, + { + "epoch": 0.7615723174988618, + "grad_norm": 0.8272436857223511, + "learning_rate": 8.734939759036145e-05, + "loss": 1.9612, + "step": 2509 + }, + { + "epoch": 0.7618758536955532, + "grad_norm": 0.3841719329357147, + "learning_rate": 8.734433532449125e-05, + "loss": 1.8698, + "step": 2510 + }, + { + "epoch": 0.7621793898922447, + "grad_norm": 0.459075391292572, + "learning_rate": 8.733927305862104e-05, + "loss": 1.8878, + "step": 2511 + }, + { + "epoch": 0.7624829260889361, + "grad_norm": 0.41815492510795593, + "learning_rate": 8.733421079275084e-05, + "loss": 1.8751, + "step": 2512 + }, + { + "epoch": 0.7627864622856275, + "grad_norm": 0.41531050205230713, + "learning_rate": 8.732914852688063e-05, + "loss": 1.8247, + "step": 2513 + }, + { + "epoch": 0.7630899984823191, + "grad_norm": 0.36942997574806213, + "learning_rate": 8.732408626101043e-05, + "loss": 2.0158, + "step": 2514 + }, + { + "epoch": 0.7633935346790105, + "grad_norm": 0.3985773026943207, + "learning_rate": 8.731902399514022e-05, + "loss": 2.0955, + "step": 2515 + }, + { + "epoch": 0.7636970708757019, + "grad_norm": 0.45657238364219666, + "learning_rate": 8.731396172927002e-05, + "loss": 1.2635, + "step": 2516 + }, + { + "epoch": 0.7640006070723934, + "grad_norm": 0.35013964772224426, + "learning_rate": 8.730889946339982e-05, + "loss": 2.1223, + "step": 2517 + }, + { + "epoch": 0.7643041432690848, + "grad_norm": 0.48166340589523315, + "learning_rate": 8.730383719752962e-05, + "loss": 1.9952, + "step": 2518 + }, + { + "epoch": 0.7646076794657763, + "grad_norm": 0.3770373463630676, + "learning_rate": 8.729877493165941e-05, + "loss": 2.0698, + "step": 2519 + }, + { + "epoch": 0.7649112156624678, + "grad_norm": 0.6299264430999756, + "learning_rate": 8.729371266578921e-05, + "loss": 2.1133, + "step": 2520 + }, + { + "epoch": 0.7652147518591592, + "grad_norm": 0.3834339380264282, + "learning_rate": 8.7288650399919e-05, + "loss": 1.6254, + "step": 2521 + }, + { + "epoch": 0.7655182880558506, + "grad_norm": 0.4225000739097595, + "learning_rate": 8.72835881340488e-05, + "loss": 1.5402, + "step": 2522 + }, + { + "epoch": 0.7658218242525421, + "grad_norm": 0.3836756646633148, + "learning_rate": 8.727852586817859e-05, + "loss": 1.8559, + "step": 2523 + }, + { + "epoch": 0.7661253604492335, + "grad_norm": 0.43883371353149414, + "learning_rate": 8.72734636023084e-05, + "loss": 1.8589, + "step": 2524 + }, + { + "epoch": 0.7664288966459251, + "grad_norm": 0.3844871520996094, + "learning_rate": 8.72684013364382e-05, + "loss": 1.9853, + "step": 2525 + }, + { + "epoch": 0.7667324328426165, + "grad_norm": 0.37431496381759644, + "learning_rate": 8.726333907056799e-05, + "loss": 1.9808, + "step": 2526 + }, + { + "epoch": 0.7670359690393079, + "grad_norm": 0.35484790802001953, + "learning_rate": 8.725827680469779e-05, + "loss": 1.2544, + "step": 2527 + }, + { + "epoch": 0.7673395052359994, + "grad_norm": 0.3555900752544403, + "learning_rate": 8.72532145388276e-05, + "loss": 1.3952, + "step": 2528 + }, + { + "epoch": 0.7676430414326908, + "grad_norm": 0.4385487139225006, + "learning_rate": 8.724815227295739e-05, + "loss": 2.1207, + "step": 2529 + }, + { + "epoch": 0.7679465776293823, + "grad_norm": 0.41865015029907227, + "learning_rate": 8.724309000708718e-05, + "loss": 1.2081, + "step": 2530 + }, + { + "epoch": 0.7682501138260738, + "grad_norm": 0.44620874524116516, + "learning_rate": 8.723802774121698e-05, + "loss": 1.7552, + "step": 2531 + }, + { + "epoch": 0.7685536500227652, + "grad_norm": 0.42884379625320435, + "learning_rate": 8.723296547534677e-05, + "loss": 1.8443, + "step": 2532 + }, + { + "epoch": 0.7688571862194566, + "grad_norm": 0.41244685649871826, + "learning_rate": 8.722790320947657e-05, + "loss": 1.9597, + "step": 2533 + }, + { + "epoch": 0.7691607224161481, + "grad_norm": 0.7400226593017578, + "learning_rate": 8.722284094360636e-05, + "loss": 1.9847, + "step": 2534 + }, + { + "epoch": 0.7694642586128396, + "grad_norm": 0.4088320732116699, + "learning_rate": 8.721777867773616e-05, + "loss": 1.5025, + "step": 2535 + }, + { + "epoch": 0.7697677948095311, + "grad_norm": 0.4008265435695648, + "learning_rate": 8.721271641186595e-05, + "loss": 1.879, + "step": 2536 + }, + { + "epoch": 0.7700713310062225, + "grad_norm": 0.3870142996311188, + "learning_rate": 8.720765414599576e-05, + "loss": 1.9367, + "step": 2537 + }, + { + "epoch": 0.7703748672029139, + "grad_norm": 0.4387873411178589, + "learning_rate": 8.720259188012556e-05, + "loss": 2.2303, + "step": 2538 + }, + { + "epoch": 0.7706784033996054, + "grad_norm": 0.707614541053772, + "learning_rate": 8.719752961425535e-05, + "loss": 1.5612, + "step": 2539 + }, + { + "epoch": 0.7709819395962969, + "grad_norm": 0.43096137046813965, + "learning_rate": 8.719246734838514e-05, + "loss": 1.544, + "step": 2540 + }, + { + "epoch": 0.7712854757929883, + "grad_norm": 0.3771781027317047, + "learning_rate": 8.718740508251494e-05, + "loss": 1.79, + "step": 2541 + }, + { + "epoch": 0.7715890119896798, + "grad_norm": 0.39454761147499084, + "learning_rate": 8.718234281664473e-05, + "loss": 1.4474, + "step": 2542 + }, + { + "epoch": 0.7718925481863712, + "grad_norm": 0.421641081571579, + "learning_rate": 8.717728055077453e-05, + "loss": 1.8482, + "step": 2543 + }, + { + "epoch": 0.7721960843830626, + "grad_norm": 0.38047879934310913, + "learning_rate": 8.717221828490432e-05, + "loss": 1.2413, + "step": 2544 + }, + { + "epoch": 0.7724996205797542, + "grad_norm": 0.38516274094581604, + "learning_rate": 8.716715601903412e-05, + "loss": 1.9199, + "step": 2545 + }, + { + "epoch": 0.7728031567764456, + "grad_norm": 0.38349801301956177, + "learning_rate": 8.716209375316393e-05, + "loss": 1.999, + "step": 2546 + }, + { + "epoch": 0.773106692973137, + "grad_norm": 0.5327167510986328, + "learning_rate": 8.715703148729372e-05, + "loss": 1.5738, + "step": 2547 + }, + { + "epoch": 0.7734102291698285, + "grad_norm": 0.3783544898033142, + "learning_rate": 8.715196922142352e-05, + "loss": 1.8474, + "step": 2548 + }, + { + "epoch": 0.7737137653665199, + "grad_norm": 0.509729266166687, + "learning_rate": 8.714690695555331e-05, + "loss": 2.2824, + "step": 2549 + }, + { + "epoch": 0.7740173015632115, + "grad_norm": 0.4439513087272644, + "learning_rate": 8.71418446896831e-05, + "loss": 1.8373, + "step": 2550 + }, + { + "epoch": 0.7743208377599029, + "grad_norm": 0.4309268891811371, + "learning_rate": 8.71367824238129e-05, + "loss": 1.3441, + "step": 2551 + }, + { + "epoch": 0.7746243739565943, + "grad_norm": 0.4033602178096771, + "learning_rate": 8.71317201579427e-05, + "loss": 2.0037, + "step": 2552 + }, + { + "epoch": 0.7749279101532858, + "grad_norm": 0.42097219824790955, + "learning_rate": 8.712665789207249e-05, + "loss": 2.0319, + "step": 2553 + }, + { + "epoch": 0.7752314463499772, + "grad_norm": 0.43752336502075195, + "learning_rate": 8.712159562620229e-05, + "loss": 1.6834, + "step": 2554 + }, + { + "epoch": 0.7755349825466686, + "grad_norm": 0.4009190499782562, + "learning_rate": 8.711653336033208e-05, + "loss": 1.8764, + "step": 2555 + }, + { + "epoch": 0.7758385187433602, + "grad_norm": 0.38049957156181335, + "learning_rate": 8.711147109446189e-05, + "loss": 1.5514, + "step": 2556 + }, + { + "epoch": 0.7761420549400516, + "grad_norm": 0.7045227289199829, + "learning_rate": 8.710640882859168e-05, + "loss": 2.045, + "step": 2557 + }, + { + "epoch": 0.776445591136743, + "grad_norm": 0.4141732454299927, + "learning_rate": 8.710134656272148e-05, + "loss": 1.9546, + "step": 2558 + }, + { + "epoch": 0.7767491273334345, + "grad_norm": 0.36503976583480835, + "learning_rate": 8.709628429685127e-05, + "loss": 1.8611, + "step": 2559 + }, + { + "epoch": 0.7770526635301259, + "grad_norm": 0.4061439335346222, + "learning_rate": 8.709122203098107e-05, + "loss": 1.5297, + "step": 2560 + }, + { + "epoch": 0.7773561997268175, + "grad_norm": 0.39136406779289246, + "learning_rate": 8.708615976511086e-05, + "loss": 1.7483, + "step": 2561 + }, + { + "epoch": 0.7776597359235089, + "grad_norm": 0.38786038756370544, + "learning_rate": 8.708109749924066e-05, + "loss": 1.6304, + "step": 2562 + }, + { + "epoch": 0.7779632721202003, + "grad_norm": 0.44066160917282104, + "learning_rate": 8.707603523337045e-05, + "loss": 1.8819, + "step": 2563 + }, + { + "epoch": 0.7782668083168918, + "grad_norm": 0.4141193628311157, + "learning_rate": 8.707097296750025e-05, + "loss": 1.5542, + "step": 2564 + }, + { + "epoch": 0.7785703445135832, + "grad_norm": 0.3722589910030365, + "learning_rate": 8.706591070163006e-05, + "loss": 1.792, + "step": 2565 + }, + { + "epoch": 0.7788738807102747, + "grad_norm": 0.4519922435283661, + "learning_rate": 8.706084843575985e-05, + "loss": 1.8324, + "step": 2566 + }, + { + "epoch": 0.7791774169069662, + "grad_norm": 0.41349706053733826, + "learning_rate": 8.705578616988964e-05, + "loss": 1.841, + "step": 2567 + }, + { + "epoch": 0.7794809531036576, + "grad_norm": 0.445417195558548, + "learning_rate": 8.705072390401944e-05, + "loss": 1.6788, + "step": 2568 + }, + { + "epoch": 0.779784489300349, + "grad_norm": 0.35337746143341064, + "learning_rate": 8.704566163814925e-05, + "loss": 1.5129, + "step": 2569 + }, + { + "epoch": 0.7800880254970405, + "grad_norm": 0.49805590510368347, + "learning_rate": 8.704059937227904e-05, + "loss": 1.7206, + "step": 2570 + }, + { + "epoch": 0.780391561693732, + "grad_norm": 0.3580697774887085, + "learning_rate": 8.703553710640884e-05, + "loss": 1.8458, + "step": 2571 + }, + { + "epoch": 0.7806950978904235, + "grad_norm": 0.557847797870636, + "learning_rate": 8.703047484053863e-05, + "loss": 1.824, + "step": 2572 + }, + { + "epoch": 0.7809986340871149, + "grad_norm": 1.6153925657272339, + "learning_rate": 8.702541257466843e-05, + "loss": 2.0839, + "step": 2573 + }, + { + "epoch": 0.7813021702838063, + "grad_norm": 0.44338542222976685, + "learning_rate": 8.702035030879822e-05, + "loss": 1.8798, + "step": 2574 + }, + { + "epoch": 0.7816057064804978, + "grad_norm": 0.4379113018512726, + "learning_rate": 8.701528804292802e-05, + "loss": 1.8003, + "step": 2575 + }, + { + "epoch": 0.7819092426771893, + "grad_norm": 0.42209142446517944, + "learning_rate": 8.701022577705783e-05, + "loss": 1.1716, + "step": 2576 + }, + { + "epoch": 0.7822127788738807, + "grad_norm": 0.4423658549785614, + "learning_rate": 8.700516351118762e-05, + "loss": 1.9534, + "step": 2577 + }, + { + "epoch": 0.7825163150705722, + "grad_norm": 0.4544404149055481, + "learning_rate": 8.700010124531741e-05, + "loss": 1.3336, + "step": 2578 + }, + { + "epoch": 0.7828198512672636, + "grad_norm": 0.34568536281585693, + "learning_rate": 8.699503897944721e-05, + "loss": 1.9537, + "step": 2579 + }, + { + "epoch": 0.783123387463955, + "grad_norm": 0.545414924621582, + "learning_rate": 8.6989976713577e-05, + "loss": 1.9132, + "step": 2580 + }, + { + "epoch": 0.7834269236606465, + "grad_norm": 0.4345841705799103, + "learning_rate": 8.69849144477068e-05, + "loss": 1.7581, + "step": 2581 + }, + { + "epoch": 0.783730459857338, + "grad_norm": 0.4052067995071411, + "learning_rate": 8.69798521818366e-05, + "loss": 1.7303, + "step": 2582 + }, + { + "epoch": 0.7840339960540295, + "grad_norm": 0.34817397594451904, + "learning_rate": 8.697478991596639e-05, + "loss": 1.8363, + "step": 2583 + }, + { + "epoch": 0.7843375322507209, + "grad_norm": 0.3445320725440979, + "learning_rate": 8.696972765009618e-05, + "loss": 1.9267, + "step": 2584 + }, + { + "epoch": 0.7846410684474123, + "grad_norm": 0.43368205428123474, + "learning_rate": 8.696466538422599e-05, + "loss": 2.0271, + "step": 2585 + }, + { + "epoch": 0.7849446046441038, + "grad_norm": 0.4825034439563751, + "learning_rate": 8.695960311835579e-05, + "loss": 2.0727, + "step": 2586 + }, + { + "epoch": 0.7852481408407953, + "grad_norm": 0.6833105087280273, + "learning_rate": 8.695454085248558e-05, + "loss": 1.5662, + "step": 2587 + }, + { + "epoch": 0.7855516770374867, + "grad_norm": 0.3476558029651642, + "learning_rate": 8.694947858661538e-05, + "loss": 1.9023, + "step": 2588 + }, + { + "epoch": 0.7858552132341782, + "grad_norm": 0.49442049860954285, + "learning_rate": 8.694441632074517e-05, + "loss": 1.6733, + "step": 2589 + }, + { + "epoch": 0.7861587494308696, + "grad_norm": 1.6963638067245483, + "learning_rate": 8.693935405487497e-05, + "loss": 2.0808, + "step": 2590 + }, + { + "epoch": 0.786462285627561, + "grad_norm": 0.48170772194862366, + "learning_rate": 8.693429178900476e-05, + "loss": 1.6269, + "step": 2591 + }, + { + "epoch": 0.7867658218242526, + "grad_norm": 0.427327036857605, + "learning_rate": 8.692922952313456e-05, + "loss": 2.0448, + "step": 2592 + }, + { + "epoch": 0.787069358020944, + "grad_norm": 0.3641161322593689, + "learning_rate": 8.692416725726435e-05, + "loss": 2.0699, + "step": 2593 + }, + { + "epoch": 0.7873728942176355, + "grad_norm": 0.4324423372745514, + "learning_rate": 8.691910499139414e-05, + "loss": 1.1989, + "step": 2594 + }, + { + "epoch": 0.7876764304143269, + "grad_norm": 0.4303852617740631, + "learning_rate": 8.691404272552395e-05, + "loss": 1.866, + "step": 2595 + }, + { + "epoch": 0.7879799666110183, + "grad_norm": 0.36840641498565674, + "learning_rate": 8.690898045965375e-05, + "loss": 2.0927, + "step": 2596 + }, + { + "epoch": 0.7882835028077099, + "grad_norm": 0.43906763195991516, + "learning_rate": 8.690391819378354e-05, + "loss": 1.8755, + "step": 2597 + }, + { + "epoch": 0.7885870390044013, + "grad_norm": 0.43337517976760864, + "learning_rate": 8.689885592791334e-05, + "loss": 1.7263, + "step": 2598 + }, + { + "epoch": 0.7888905752010927, + "grad_norm": 0.35808295011520386, + "learning_rate": 8.689379366204313e-05, + "loss": 1.5395, + "step": 2599 + }, + { + "epoch": 0.7891941113977842, + "grad_norm": 0.4063914120197296, + "learning_rate": 8.688873139617293e-05, + "loss": 1.7125, + "step": 2600 + }, + { + "epoch": 0.7894976475944756, + "grad_norm": 0.35243427753448486, + "learning_rate": 8.688366913030272e-05, + "loss": 1.705, + "step": 2601 + }, + { + "epoch": 0.7898011837911671, + "grad_norm": 0.4404586851596832, + "learning_rate": 8.687860686443252e-05, + "loss": 1.9805, + "step": 2602 + }, + { + "epoch": 0.7901047199878586, + "grad_norm": 0.45531004667282104, + "learning_rate": 8.687354459856231e-05, + "loss": 2.1248, + "step": 2603 + }, + { + "epoch": 0.79040825618455, + "grad_norm": 0.4575786292552948, + "learning_rate": 8.686848233269212e-05, + "loss": 2.0493, + "step": 2604 + }, + { + "epoch": 0.7907117923812415, + "grad_norm": 0.4143056571483612, + "learning_rate": 8.686342006682191e-05, + "loss": 1.7757, + "step": 2605 + }, + { + "epoch": 0.7910153285779329, + "grad_norm": 0.41257745027542114, + "learning_rate": 8.685835780095171e-05, + "loss": 2.1223, + "step": 2606 + }, + { + "epoch": 0.7913188647746243, + "grad_norm": 0.4308036267757416, + "learning_rate": 8.68532955350815e-05, + "loss": 1.8967, + "step": 2607 + }, + { + "epoch": 0.7916224009713159, + "grad_norm": 1.4339756965637207, + "learning_rate": 8.68482332692113e-05, + "loss": 2.0286, + "step": 2608 + }, + { + "epoch": 0.7919259371680073, + "grad_norm": 0.39570608735084534, + "learning_rate": 8.68431710033411e-05, + "loss": 2.0292, + "step": 2609 + }, + { + "epoch": 0.7922294733646987, + "grad_norm": 0.39638906717300415, + "learning_rate": 8.683810873747089e-05, + "loss": 1.9292, + "step": 2610 + }, + { + "epoch": 0.7925330095613902, + "grad_norm": 0.40838631987571716, + "learning_rate": 8.683304647160068e-05, + "loss": 1.9439, + "step": 2611 + }, + { + "epoch": 0.7928365457580816, + "grad_norm": 0.41017046570777893, + "learning_rate": 8.682798420573048e-05, + "loss": 1.7575, + "step": 2612 + }, + { + "epoch": 0.7931400819547731, + "grad_norm": 0.38030532002449036, + "learning_rate": 8.682292193986029e-05, + "loss": 2.0398, + "step": 2613 + }, + { + "epoch": 0.7934436181514646, + "grad_norm": 0.42547357082366943, + "learning_rate": 8.681785967399008e-05, + "loss": 1.9265, + "step": 2614 + }, + { + "epoch": 0.793747154348156, + "grad_norm": 0.42651450634002686, + "learning_rate": 8.681279740811989e-05, + "loss": 1.8808, + "step": 2615 + }, + { + "epoch": 0.7940506905448474, + "grad_norm": 0.4874178469181061, + "learning_rate": 8.680773514224968e-05, + "loss": 1.8267, + "step": 2616 + }, + { + "epoch": 0.7943542267415389, + "grad_norm": 0.4573056101799011, + "learning_rate": 8.680267287637948e-05, + "loss": 2.0073, + "step": 2617 + }, + { + "epoch": 0.7946577629382304, + "grad_norm": 0.4408004879951477, + "learning_rate": 8.679761061050927e-05, + "loss": 2.1719, + "step": 2618 + }, + { + "epoch": 0.7949612991349219, + "grad_norm": 0.41363367438316345, + "learning_rate": 8.679254834463907e-05, + "loss": 2.0006, + "step": 2619 + }, + { + "epoch": 0.7952648353316133, + "grad_norm": 0.3256136178970337, + "learning_rate": 8.678748607876886e-05, + "loss": 1.5214, + "step": 2620 + }, + { + "epoch": 0.7955683715283047, + "grad_norm": 0.3597501516342163, + "learning_rate": 8.678242381289866e-05, + "loss": 1.9775, + "step": 2621 + }, + { + "epoch": 0.7958719077249962, + "grad_norm": 0.43128228187561035, + "learning_rate": 8.677736154702845e-05, + "loss": 1.3915, + "step": 2622 + }, + { + "epoch": 0.7961754439216877, + "grad_norm": 0.6114957332611084, + "learning_rate": 8.677229928115825e-05, + "loss": 2.0548, + "step": 2623 + }, + { + "epoch": 0.7964789801183791, + "grad_norm": 0.6381771564483643, + "learning_rate": 8.676723701528806e-05, + "loss": 2.1353, + "step": 2624 + }, + { + "epoch": 0.7967825163150706, + "grad_norm": 0.39409366250038147, + "learning_rate": 8.676217474941785e-05, + "loss": 0.9988, + "step": 2625 + }, + { + "epoch": 0.797086052511762, + "grad_norm": 0.4145677387714386, + "learning_rate": 8.675711248354765e-05, + "loss": 2.1825, + "step": 2626 + }, + { + "epoch": 0.7973895887084534, + "grad_norm": 0.40860435366630554, + "learning_rate": 8.675205021767744e-05, + "loss": 2.1545, + "step": 2627 + }, + { + "epoch": 0.797693124905145, + "grad_norm": 0.42259758710861206, + "learning_rate": 8.674698795180724e-05, + "loss": 2.0872, + "step": 2628 + }, + { + "epoch": 0.7979966611018364, + "grad_norm": 0.9106017351150513, + "learning_rate": 8.674192568593703e-05, + "loss": 1.8756, + "step": 2629 + }, + { + "epoch": 0.7983001972985279, + "grad_norm": 0.4160531163215637, + "learning_rate": 8.673686342006683e-05, + "loss": 1.6454, + "step": 2630 + }, + { + "epoch": 0.7986037334952193, + "grad_norm": 0.4564226269721985, + "learning_rate": 8.673180115419662e-05, + "loss": 1.6036, + "step": 2631 + }, + { + "epoch": 0.7989072696919107, + "grad_norm": 0.5077611207962036, + "learning_rate": 8.672673888832641e-05, + "loss": 1.8217, + "step": 2632 + }, + { + "epoch": 0.7992108058886022, + "grad_norm": 0.3732128143310547, + "learning_rate": 8.672167662245621e-05, + "loss": 1.6299, + "step": 2633 + }, + { + "epoch": 0.7995143420852937, + "grad_norm": 0.4433646500110626, + "learning_rate": 8.671661435658602e-05, + "loss": 2.0876, + "step": 2634 + }, + { + "epoch": 0.7998178782819851, + "grad_norm": 0.3869750201702118, + "learning_rate": 8.671155209071581e-05, + "loss": 1.9312, + "step": 2635 + }, + { + "epoch": 0.8001214144786766, + "grad_norm": 0.3622623682022095, + "learning_rate": 8.670648982484561e-05, + "loss": 1.3211, + "step": 2636 + }, + { + "epoch": 0.800424950675368, + "grad_norm": 0.38390904664993286, + "learning_rate": 8.67014275589754e-05, + "loss": 2.0186, + "step": 2637 + }, + { + "epoch": 0.8007284868720594, + "grad_norm": 0.5641773343086243, + "learning_rate": 8.66963652931052e-05, + "loss": 1.5207, + "step": 2638 + }, + { + "epoch": 0.801032023068751, + "grad_norm": 0.399679571390152, + "learning_rate": 8.669130302723499e-05, + "loss": 1.621, + "step": 2639 + }, + { + "epoch": 0.8013355592654424, + "grad_norm": 0.38951337337493896, + "learning_rate": 8.668624076136479e-05, + "loss": 1.8445, + "step": 2640 + }, + { + "epoch": 0.8016390954621339, + "grad_norm": 0.47294196486473083, + "learning_rate": 8.668117849549458e-05, + "loss": 1.4963, + "step": 2641 + }, + { + "epoch": 0.8019426316588253, + "grad_norm": 0.4139798581600189, + "learning_rate": 8.667611622962438e-05, + "loss": 1.863, + "step": 2642 + }, + { + "epoch": 0.8022461678555167, + "grad_norm": 0.42392146587371826, + "learning_rate": 8.667105396375418e-05, + "loss": 1.4778, + "step": 2643 + }, + { + "epoch": 0.8025497040522083, + "grad_norm": 0.4208984673023224, + "learning_rate": 8.666599169788398e-05, + "loss": 1.8756, + "step": 2644 + }, + { + "epoch": 0.8028532402488997, + "grad_norm": 0.3515019714832306, + "learning_rate": 8.666092943201377e-05, + "loss": 1.8822, + "step": 2645 + }, + { + "epoch": 0.8031567764455911, + "grad_norm": 0.4780139923095703, + "learning_rate": 8.665586716614357e-05, + "loss": 1.7862, + "step": 2646 + }, + { + "epoch": 0.8034603126422826, + "grad_norm": 0.4419991970062256, + "learning_rate": 8.665080490027336e-05, + "loss": 2.0666, + "step": 2647 + }, + { + "epoch": 0.803763848838974, + "grad_norm": 0.43830469250679016, + "learning_rate": 8.664574263440316e-05, + "loss": 2.2725, + "step": 2648 + }, + { + "epoch": 0.8040673850356655, + "grad_norm": 0.3989112377166748, + "learning_rate": 8.664068036853295e-05, + "loss": 1.9306, + "step": 2649 + }, + { + "epoch": 0.804370921232357, + "grad_norm": 0.3216220438480377, + "learning_rate": 8.663561810266275e-05, + "loss": 1.862, + "step": 2650 + }, + { + "epoch": 0.8046744574290484, + "grad_norm": 0.4546568989753723, + "learning_rate": 8.663055583679254e-05, + "loss": 2.0723, + "step": 2651 + }, + { + "epoch": 0.8049779936257399, + "grad_norm": 0.39314141869544983, + "learning_rate": 8.662549357092235e-05, + "loss": 1.7299, + "step": 2652 + }, + { + "epoch": 0.8052815298224313, + "grad_norm": 0.4112257957458496, + "learning_rate": 8.662043130505215e-05, + "loss": 1.2306, + "step": 2653 + }, + { + "epoch": 0.8055850660191228, + "grad_norm": 0.8900299072265625, + "learning_rate": 8.661536903918194e-05, + "loss": 2.0176, + "step": 2654 + }, + { + "epoch": 0.8058886022158143, + "grad_norm": 0.35671380162239075, + "learning_rate": 8.661030677331174e-05, + "loss": 1.521, + "step": 2655 + }, + { + "epoch": 0.8061921384125057, + "grad_norm": 0.3438098430633545, + "learning_rate": 8.660524450744153e-05, + "loss": 1.5401, + "step": 2656 + }, + { + "epoch": 0.8064956746091971, + "grad_norm": 0.4241732656955719, + "learning_rate": 8.660018224157133e-05, + "loss": 1.5139, + "step": 2657 + }, + { + "epoch": 0.8067992108058886, + "grad_norm": 0.41691192984580994, + "learning_rate": 8.659511997570113e-05, + "loss": 1.9927, + "step": 2658 + }, + { + "epoch": 0.8071027470025801, + "grad_norm": 0.36074796319007874, + "learning_rate": 8.659005770983093e-05, + "loss": 1.3125, + "step": 2659 + }, + { + "epoch": 0.8074062831992715, + "grad_norm": 0.503271222114563, + "learning_rate": 8.658499544396072e-05, + "loss": 2.0934, + "step": 2660 + }, + { + "epoch": 0.807709819395963, + "grad_norm": 0.47022250294685364, + "learning_rate": 8.657993317809052e-05, + "loss": 1.515, + "step": 2661 + }, + { + "epoch": 0.8080133555926544, + "grad_norm": 0.5267159938812256, + "learning_rate": 8.657487091222031e-05, + "loss": 1.7448, + "step": 2662 + }, + { + "epoch": 0.8083168917893458, + "grad_norm": 0.5382044315338135, + "learning_rate": 8.656980864635012e-05, + "loss": 1.6507, + "step": 2663 + }, + { + "epoch": 0.8086204279860373, + "grad_norm": 0.5040610432624817, + "learning_rate": 8.656474638047992e-05, + "loss": 1.3789, + "step": 2664 + }, + { + "epoch": 0.8089239641827288, + "grad_norm": 0.356317400932312, + "learning_rate": 8.655968411460971e-05, + "loss": 1.8226, + "step": 2665 + }, + { + "epoch": 0.8092275003794203, + "grad_norm": 0.38693082332611084, + "learning_rate": 8.65546218487395e-05, + "loss": 1.8358, + "step": 2666 + }, + { + "epoch": 0.8095310365761117, + "grad_norm": 0.42606496810913086, + "learning_rate": 8.65495595828693e-05, + "loss": 1.9002, + "step": 2667 + }, + { + "epoch": 0.8098345727728031, + "grad_norm": 0.3855800926685333, + "learning_rate": 8.65444973169991e-05, + "loss": 1.8488, + "step": 2668 + }, + { + "epoch": 0.8101381089694946, + "grad_norm": 0.46677157282829285, + "learning_rate": 8.653943505112889e-05, + "loss": 2.264, + "step": 2669 + }, + { + "epoch": 0.8104416451661861, + "grad_norm": 0.3479576110839844, + "learning_rate": 8.653437278525868e-05, + "loss": 1.796, + "step": 2670 + }, + { + "epoch": 0.8107451813628775, + "grad_norm": 0.4703936278820038, + "learning_rate": 8.652931051938848e-05, + "loss": 2.0457, + "step": 2671 + }, + { + "epoch": 0.811048717559569, + "grad_norm": 0.3478047847747803, + "learning_rate": 8.652424825351827e-05, + "loss": 1.8033, + "step": 2672 + }, + { + "epoch": 0.8113522537562604, + "grad_norm": 0.4196014702320099, + "learning_rate": 8.651918598764808e-05, + "loss": 1.9513, + "step": 2673 + }, + { + "epoch": 0.8116557899529518, + "grad_norm": 0.36813899874687195, + "learning_rate": 8.651412372177788e-05, + "loss": 1.9776, + "step": 2674 + }, + { + "epoch": 0.8119593261496434, + "grad_norm": 0.44413039088249207, + "learning_rate": 8.650906145590767e-05, + "loss": 1.9041, + "step": 2675 + }, + { + "epoch": 0.8122628623463348, + "grad_norm": 0.4073639512062073, + "learning_rate": 8.650399919003747e-05, + "loss": 1.9749, + "step": 2676 + }, + { + "epoch": 0.8125663985430263, + "grad_norm": 0.3961658775806427, + "learning_rate": 8.649893692416726e-05, + "loss": 1.8646, + "step": 2677 + }, + { + "epoch": 0.8128699347397177, + "grad_norm": 0.536353349685669, + "learning_rate": 8.649387465829706e-05, + "loss": 1.7772, + "step": 2678 + }, + { + "epoch": 0.8131734709364091, + "grad_norm": 0.4030105471611023, + "learning_rate": 8.648881239242685e-05, + "loss": 2.1668, + "step": 2679 + }, + { + "epoch": 0.8134770071331007, + "grad_norm": 0.4185904264450073, + "learning_rate": 8.648375012655665e-05, + "loss": 1.877, + "step": 2680 + }, + { + "epoch": 0.8137805433297921, + "grad_norm": 0.4530700445175171, + "learning_rate": 8.647868786068644e-05, + "loss": 1.7669, + "step": 2681 + }, + { + "epoch": 0.8140840795264835, + "grad_norm": 0.7239555716514587, + "learning_rate": 8.647362559481625e-05, + "loss": 1.7399, + "step": 2682 + }, + { + "epoch": 0.814387615723175, + "grad_norm": 0.5373411178588867, + "learning_rate": 8.646856332894604e-05, + "loss": 1.3157, + "step": 2683 + }, + { + "epoch": 0.8146911519198664, + "grad_norm": 0.41730010509490967, + "learning_rate": 8.646350106307584e-05, + "loss": 1.9912, + "step": 2684 + }, + { + "epoch": 0.814994688116558, + "grad_norm": 0.4486635625362396, + "learning_rate": 8.645843879720563e-05, + "loss": 1.9651, + "step": 2685 + }, + { + "epoch": 0.8152982243132494, + "grad_norm": 0.5648373961448669, + "learning_rate": 8.645337653133543e-05, + "loss": 1.7917, + "step": 2686 + }, + { + "epoch": 0.8156017605099408, + "grad_norm": 0.4445558488368988, + "learning_rate": 8.644831426546522e-05, + "loss": 1.71, + "step": 2687 + }, + { + "epoch": 0.8159052967066323, + "grad_norm": 0.4051614999771118, + "learning_rate": 8.644325199959502e-05, + "loss": 2.0976, + "step": 2688 + }, + { + "epoch": 0.8162088329033237, + "grad_norm": 0.41357868909835815, + "learning_rate": 8.643818973372481e-05, + "loss": 1.8658, + "step": 2689 + }, + { + "epoch": 0.8165123691000151, + "grad_norm": 0.43194282054901123, + "learning_rate": 8.643312746785461e-05, + "loss": 1.3586, + "step": 2690 + }, + { + "epoch": 0.8168159052967067, + "grad_norm": 0.4093270003795624, + "learning_rate": 8.642806520198442e-05, + "loss": 1.8946, + "step": 2691 + }, + { + "epoch": 0.8171194414933981, + "grad_norm": 0.5622807741165161, + "learning_rate": 8.642300293611421e-05, + "loss": 1.9175, + "step": 2692 + }, + { + "epoch": 0.8174229776900895, + "grad_norm": 0.41735681891441345, + "learning_rate": 8.6417940670244e-05, + "loss": 2.0407, + "step": 2693 + }, + { + "epoch": 0.817726513886781, + "grad_norm": 0.4518575966358185, + "learning_rate": 8.64128784043738e-05, + "loss": 1.8137, + "step": 2694 + }, + { + "epoch": 0.8180300500834724, + "grad_norm": 0.34537529945373535, + "learning_rate": 8.64078161385036e-05, + "loss": 1.7952, + "step": 2695 + }, + { + "epoch": 0.818333586280164, + "grad_norm": 0.44123801589012146, + "learning_rate": 8.640275387263339e-05, + "loss": 1.74, + "step": 2696 + }, + { + "epoch": 0.8186371224768554, + "grad_norm": 0.36231714487075806, + "learning_rate": 8.639769160676318e-05, + "loss": 1.9205, + "step": 2697 + }, + { + "epoch": 0.8189406586735468, + "grad_norm": 0.4573259651660919, + "learning_rate": 8.639262934089298e-05, + "loss": 1.8026, + "step": 2698 + }, + { + "epoch": 0.8192441948702383, + "grad_norm": 0.43909579515457153, + "learning_rate": 8.638756707502277e-05, + "loss": 1.9559, + "step": 2699 + }, + { + "epoch": 0.8195477310669297, + "grad_norm": 0.4051404893398285, + "learning_rate": 8.638250480915257e-05, + "loss": 1.9962, + "step": 2700 + }, + { + "epoch": 0.8198512672636212, + "grad_norm": 0.41793152689933777, + "learning_rate": 8.637744254328238e-05, + "loss": 1.7749, + "step": 2701 + }, + { + "epoch": 0.8201548034603127, + "grad_norm": 0.5424450039863586, + "learning_rate": 8.637238027741219e-05, + "loss": 1.8725, + "step": 2702 + }, + { + "epoch": 0.8204583396570041, + "grad_norm": 0.39918193221092224, + "learning_rate": 8.636731801154198e-05, + "loss": 2.0224, + "step": 2703 + }, + { + "epoch": 0.8207618758536955, + "grad_norm": 0.40323561429977417, + "learning_rate": 8.636225574567178e-05, + "loss": 1.9338, + "step": 2704 + }, + { + "epoch": 0.821065412050387, + "grad_norm": 0.40484192967414856, + "learning_rate": 8.635719347980157e-05, + "loss": 1.8665, + "step": 2705 + }, + { + "epoch": 0.8213689482470785, + "grad_norm": 0.45353245735168457, + "learning_rate": 8.635213121393137e-05, + "loss": 2.0364, + "step": 2706 + }, + { + "epoch": 0.82167248444377, + "grad_norm": 0.5369464755058289, + "learning_rate": 8.634706894806116e-05, + "loss": 1.3729, + "step": 2707 + }, + { + "epoch": 0.8219760206404614, + "grad_norm": 0.3175603449344635, + "learning_rate": 8.634200668219095e-05, + "loss": 1.6742, + "step": 2708 + }, + { + "epoch": 0.8222795568371528, + "grad_norm": 0.4314495623111725, + "learning_rate": 8.633694441632075e-05, + "loss": 1.9536, + "step": 2709 + }, + { + "epoch": 0.8225830930338442, + "grad_norm": 0.4610050916671753, + "learning_rate": 8.633188215045054e-05, + "loss": 2.0608, + "step": 2710 + }, + { + "epoch": 0.8228866292305358, + "grad_norm": 0.3542473018169403, + "learning_rate": 8.632681988458034e-05, + "loss": 1.5889, + "step": 2711 + }, + { + "epoch": 0.8231901654272272, + "grad_norm": 0.4445483684539795, + "learning_rate": 8.632175761871015e-05, + "loss": 2.0438, + "step": 2712 + }, + { + "epoch": 0.8234937016239187, + "grad_norm": 0.42590487003326416, + "learning_rate": 8.631669535283994e-05, + "loss": 1.6821, + "step": 2713 + }, + { + "epoch": 0.8237972378206101, + "grad_norm": 0.3951219618320465, + "learning_rate": 8.631163308696974e-05, + "loss": 1.8549, + "step": 2714 + }, + { + "epoch": 0.8241007740173015, + "grad_norm": 0.4422662556171417, + "learning_rate": 8.630657082109953e-05, + "loss": 1.0172, + "step": 2715 + }, + { + "epoch": 0.824404310213993, + "grad_norm": 0.6093502640724182, + "learning_rate": 8.630150855522933e-05, + "loss": 1.469, + "step": 2716 + }, + { + "epoch": 0.8247078464106845, + "grad_norm": 0.6702497005462646, + "learning_rate": 8.629644628935912e-05, + "loss": 1.7706, + "step": 2717 + }, + { + "epoch": 0.8250113826073759, + "grad_norm": 0.4154108166694641, + "learning_rate": 8.629138402348892e-05, + "loss": 2.0392, + "step": 2718 + }, + { + "epoch": 0.8253149188040674, + "grad_norm": 0.4183025062084198, + "learning_rate": 8.628632175761871e-05, + "loss": 1.648, + "step": 2719 + }, + { + "epoch": 0.8256184550007588, + "grad_norm": 0.40831395983695984, + "learning_rate": 8.62812594917485e-05, + "loss": 1.8159, + "step": 2720 + }, + { + "epoch": 0.8259219911974502, + "grad_norm": 0.3942376673221588, + "learning_rate": 8.627619722587831e-05, + "loss": 2.1595, + "step": 2721 + }, + { + "epoch": 0.8262255273941418, + "grad_norm": 0.4016304314136505, + "learning_rate": 8.627113496000811e-05, + "loss": 1.9915, + "step": 2722 + }, + { + "epoch": 0.8265290635908332, + "grad_norm": 0.43526315689086914, + "learning_rate": 8.62660726941379e-05, + "loss": 2.009, + "step": 2723 + }, + { + "epoch": 0.8268325997875247, + "grad_norm": 0.4215218424797058, + "learning_rate": 8.62610104282677e-05, + "loss": 1.6511, + "step": 2724 + }, + { + "epoch": 0.8271361359842161, + "grad_norm": 0.38574057817459106, + "learning_rate": 8.62559481623975e-05, + "loss": 1.6658, + "step": 2725 + }, + { + "epoch": 0.8274396721809075, + "grad_norm": 0.4340943694114685, + "learning_rate": 8.625088589652729e-05, + "loss": 1.9324, + "step": 2726 + }, + { + "epoch": 0.8277432083775991, + "grad_norm": 0.474386066198349, + "learning_rate": 8.624582363065708e-05, + "loss": 1.7866, + "step": 2727 + }, + { + "epoch": 0.8280467445742905, + "grad_norm": 0.4177556335926056, + "learning_rate": 8.624076136478688e-05, + "loss": 1.4593, + "step": 2728 + }, + { + "epoch": 0.8283502807709819, + "grad_norm": 0.412219375371933, + "learning_rate": 8.623569909891667e-05, + "loss": 1.592, + "step": 2729 + }, + { + "epoch": 0.8286538169676734, + "grad_norm": 0.5068672895431519, + "learning_rate": 8.623063683304648e-05, + "loss": 2.0902, + "step": 2730 + }, + { + "epoch": 0.8289573531643648, + "grad_norm": 0.43211403489112854, + "learning_rate": 8.622557456717628e-05, + "loss": 1.1154, + "step": 2731 + }, + { + "epoch": 0.8292608893610564, + "grad_norm": 0.37955889105796814, + "learning_rate": 8.622051230130607e-05, + "loss": 1.9858, + "step": 2732 + }, + { + "epoch": 0.8295644255577478, + "grad_norm": 0.4571453928947449, + "learning_rate": 8.621545003543587e-05, + "loss": 1.6966, + "step": 2733 + }, + { + "epoch": 0.8298679617544392, + "grad_norm": 0.43323561549186707, + "learning_rate": 8.621038776956566e-05, + "loss": 2.0712, + "step": 2734 + }, + { + "epoch": 0.8301714979511307, + "grad_norm": 0.4051443338394165, + "learning_rate": 8.620532550369545e-05, + "loss": 1.9132, + "step": 2735 + }, + { + "epoch": 0.8304750341478221, + "grad_norm": 0.4727547764778137, + "learning_rate": 8.620026323782525e-05, + "loss": 2.0754, + "step": 2736 + }, + { + "epoch": 0.8307785703445136, + "grad_norm": 0.8304808139801025, + "learning_rate": 8.619520097195504e-05, + "loss": 1.3452, + "step": 2737 + }, + { + "epoch": 0.8310821065412051, + "grad_norm": 0.40300288796424866, + "learning_rate": 8.619013870608484e-05, + "loss": 1.903, + "step": 2738 + }, + { + "epoch": 0.8313856427378965, + "grad_norm": 0.4302805960178375, + "learning_rate": 8.618507644021463e-05, + "loss": 1.9938, + "step": 2739 + }, + { + "epoch": 0.8316891789345879, + "grad_norm": 0.41586950421333313, + "learning_rate": 8.618001417434444e-05, + "loss": 1.8044, + "step": 2740 + }, + { + "epoch": 0.8319927151312794, + "grad_norm": 0.4185795187950134, + "learning_rate": 8.617495190847424e-05, + "loss": 2.0513, + "step": 2741 + }, + { + "epoch": 0.8322962513279709, + "grad_norm": 0.4664061367511749, + "learning_rate": 8.616988964260403e-05, + "loss": 1.6634, + "step": 2742 + }, + { + "epoch": 0.8325997875246623, + "grad_norm": 0.44080016016960144, + "learning_rate": 8.616482737673383e-05, + "loss": 1.854, + "step": 2743 + }, + { + "epoch": 0.8329033237213538, + "grad_norm": 0.4284375011920929, + "learning_rate": 8.615976511086362e-05, + "loss": 1.9713, + "step": 2744 + }, + { + "epoch": 0.8332068599180452, + "grad_norm": 0.42498892545700073, + "learning_rate": 8.615470284499342e-05, + "loss": 1.5883, + "step": 2745 + }, + { + "epoch": 0.8335103961147367, + "grad_norm": 0.5301217436790466, + "learning_rate": 8.614964057912321e-05, + "loss": 0.8439, + "step": 2746 + }, + { + "epoch": 0.8338139323114281, + "grad_norm": 0.5539612174034119, + "learning_rate": 8.614457831325302e-05, + "loss": 2.1118, + "step": 2747 + }, + { + "epoch": 0.8341174685081196, + "grad_norm": 0.47817254066467285, + "learning_rate": 8.613951604738281e-05, + "loss": 2.1536, + "step": 2748 + }, + { + "epoch": 0.8344210047048111, + "grad_norm": 0.3291810154914856, + "learning_rate": 8.613445378151261e-05, + "loss": 1.951, + "step": 2749 + }, + { + "epoch": 0.8347245409015025, + "grad_norm": 0.365369975566864, + "learning_rate": 8.61293915156424e-05, + "loss": 1.4365, + "step": 2750 + }, + { + "epoch": 0.8350280770981939, + "grad_norm": 0.4028696119785309, + "learning_rate": 8.612432924977221e-05, + "loss": 2.2443, + "step": 2751 + }, + { + "epoch": 0.8353316132948854, + "grad_norm": 0.40695205330848694, + "learning_rate": 8.611926698390201e-05, + "loss": 2.0337, + "step": 2752 + }, + { + "epoch": 0.8356351494915769, + "grad_norm": 1.1030051708221436, + "learning_rate": 8.61142047180318e-05, + "loss": 1.9415, + "step": 2753 + }, + { + "epoch": 0.8359386856882683, + "grad_norm": 0.36230286955833435, + "learning_rate": 8.61091424521616e-05, + "loss": 1.7159, + "step": 2754 + }, + { + "epoch": 0.8362422218849598, + "grad_norm": 0.36546850204467773, + "learning_rate": 8.610408018629139e-05, + "loss": 1.7412, + "step": 2755 + }, + { + "epoch": 0.8365457580816512, + "grad_norm": 0.3433781564235687, + "learning_rate": 8.609901792042119e-05, + "loss": 1.6014, + "step": 2756 + }, + { + "epoch": 0.8368492942783426, + "grad_norm": 0.4202607274055481, + "learning_rate": 8.609395565455098e-05, + "loss": 1.8733, + "step": 2757 + }, + { + "epoch": 0.8371528304750342, + "grad_norm": 0.41112053394317627, + "learning_rate": 8.608889338868078e-05, + "loss": 1.8103, + "step": 2758 + }, + { + "epoch": 0.8374563666717256, + "grad_norm": 0.44569066166877747, + "learning_rate": 8.608383112281057e-05, + "loss": 1.3989, + "step": 2759 + }, + { + "epoch": 0.8377599028684171, + "grad_norm": 0.4139041602611542, + "learning_rate": 8.607876885694038e-05, + "loss": 1.8406, + "step": 2760 + }, + { + "epoch": 0.8380634390651085, + "grad_norm": 0.40701228380203247, + "learning_rate": 8.607370659107017e-05, + "loss": 2.1573, + "step": 2761 + }, + { + "epoch": 0.8383669752617999, + "grad_norm": 0.44878578186035156, + "learning_rate": 8.606864432519997e-05, + "loss": 1.5094, + "step": 2762 + }, + { + "epoch": 0.8386705114584915, + "grad_norm": 0.5021698474884033, + "learning_rate": 8.606358205932976e-05, + "loss": 2.1396, + "step": 2763 + }, + { + "epoch": 0.8389740476551829, + "grad_norm": 0.5416968464851379, + "learning_rate": 8.605851979345956e-05, + "loss": 1.974, + "step": 2764 + }, + { + "epoch": 0.8392775838518743, + "grad_norm": 0.5953572988510132, + "learning_rate": 8.605345752758935e-05, + "loss": 1.7463, + "step": 2765 + }, + { + "epoch": 0.8395811200485658, + "grad_norm": 0.43414852023124695, + "learning_rate": 8.604839526171915e-05, + "loss": 1.7121, + "step": 2766 + }, + { + "epoch": 0.8398846562452572, + "grad_norm": 0.4000817537307739, + "learning_rate": 8.604333299584894e-05, + "loss": 2.1051, + "step": 2767 + }, + { + "epoch": 0.8401881924419488, + "grad_norm": 0.6544987559318542, + "learning_rate": 8.603827072997874e-05, + "loss": 2.063, + "step": 2768 + }, + { + "epoch": 0.8404917286386402, + "grad_norm": 0.7102285623550415, + "learning_rate": 8.603320846410855e-05, + "loss": 2.2053, + "step": 2769 + }, + { + "epoch": 0.8407952648353316, + "grad_norm": 0.39218565821647644, + "learning_rate": 8.602814619823834e-05, + "loss": 1.2981, + "step": 2770 + }, + { + "epoch": 0.8410988010320231, + "grad_norm": 0.42591944336891174, + "learning_rate": 8.602308393236814e-05, + "loss": 2.0123, + "step": 2771 + }, + { + "epoch": 0.8414023372287145, + "grad_norm": 0.3958960175514221, + "learning_rate": 8.601802166649793e-05, + "loss": 2.1612, + "step": 2772 + }, + { + "epoch": 0.8417058734254059, + "grad_norm": 0.4331991672515869, + "learning_rate": 8.601295940062772e-05, + "loss": 1.8874, + "step": 2773 + }, + { + "epoch": 0.8420094096220975, + "grad_norm": 0.42542657256126404, + "learning_rate": 8.600789713475752e-05, + "loss": 1.3892, + "step": 2774 + }, + { + "epoch": 0.8423129458187889, + "grad_norm": 0.3944099545478821, + "learning_rate": 8.600283486888731e-05, + "loss": 1.8069, + "step": 2775 + }, + { + "epoch": 0.8426164820154803, + "grad_norm": 0.42040184140205383, + "learning_rate": 8.599777260301711e-05, + "loss": 1.8774, + "step": 2776 + }, + { + "epoch": 0.8429200182121718, + "grad_norm": 0.4341401755809784, + "learning_rate": 8.59927103371469e-05, + "loss": 1.9773, + "step": 2777 + }, + { + "epoch": 0.8432235544088632, + "grad_norm": 0.4372880458831787, + "learning_rate": 8.59876480712767e-05, + "loss": 2.0873, + "step": 2778 + }, + { + "epoch": 0.8435270906055548, + "grad_norm": 0.3675346374511719, + "learning_rate": 8.598258580540651e-05, + "loss": 1.1921, + "step": 2779 + }, + { + "epoch": 0.8438306268022462, + "grad_norm": 0.39204880595207214, + "learning_rate": 8.59775235395363e-05, + "loss": 1.9907, + "step": 2780 + }, + { + "epoch": 0.8441341629989376, + "grad_norm": 0.4221871793270111, + "learning_rate": 8.59724612736661e-05, + "loss": 1.7039, + "step": 2781 + }, + { + "epoch": 0.844437699195629, + "grad_norm": 0.4046156406402588, + "learning_rate": 8.596739900779589e-05, + "loss": 2.1068, + "step": 2782 + }, + { + "epoch": 0.8447412353923205, + "grad_norm": 0.42431220412254333, + "learning_rate": 8.596233674192569e-05, + "loss": 1.7547, + "step": 2783 + }, + { + "epoch": 0.845044771589012, + "grad_norm": 0.33057859539985657, + "learning_rate": 8.595727447605548e-05, + "loss": 1.6987, + "step": 2784 + }, + { + "epoch": 0.8453483077857035, + "grad_norm": 0.40820497274398804, + "learning_rate": 8.595221221018528e-05, + "loss": 1.6101, + "step": 2785 + }, + { + "epoch": 0.8456518439823949, + "grad_norm": 0.6457285284996033, + "learning_rate": 8.594714994431507e-05, + "loss": 1.927, + "step": 2786 + }, + { + "epoch": 0.8459553801790863, + "grad_norm": 0.4055453836917877, + "learning_rate": 8.594208767844487e-05, + "loss": 1.9902, + "step": 2787 + }, + { + "epoch": 0.8462589163757778, + "grad_norm": 0.45660969614982605, + "learning_rate": 8.593702541257467e-05, + "loss": 1.8487, + "step": 2788 + }, + { + "epoch": 0.8465624525724693, + "grad_norm": 0.4082806408405304, + "learning_rate": 8.593196314670447e-05, + "loss": 1.7797, + "step": 2789 + }, + { + "epoch": 0.8468659887691607, + "grad_norm": 0.39490821957588196, + "learning_rate": 8.592690088083426e-05, + "loss": 1.9501, + "step": 2790 + }, + { + "epoch": 0.8471695249658522, + "grad_norm": 0.47634264826774597, + "learning_rate": 8.592183861496407e-05, + "loss": 1.5822, + "step": 2791 + }, + { + "epoch": 0.8474730611625436, + "grad_norm": 0.4166494607925415, + "learning_rate": 8.591677634909387e-05, + "loss": 2.0393, + "step": 2792 + }, + { + "epoch": 0.847776597359235, + "grad_norm": 0.3837972581386566, + "learning_rate": 8.591171408322366e-05, + "loss": 1.8317, + "step": 2793 + }, + { + "epoch": 0.8480801335559266, + "grad_norm": 0.3955104947090149, + "learning_rate": 8.590665181735346e-05, + "loss": 2.0546, + "step": 2794 + }, + { + "epoch": 0.848383669752618, + "grad_norm": 0.35945233702659607, + "learning_rate": 8.590158955148325e-05, + "loss": 2.0985, + "step": 2795 + }, + { + "epoch": 0.8486872059493095, + "grad_norm": 0.5097954869270325, + "learning_rate": 8.589652728561305e-05, + "loss": 1.6378, + "step": 2796 + }, + { + "epoch": 0.8489907421460009, + "grad_norm": 0.37827685475349426, + "learning_rate": 8.589146501974284e-05, + "loss": 1.9986, + "step": 2797 + }, + { + "epoch": 0.8492942783426923, + "grad_norm": 0.39725548028945923, + "learning_rate": 8.588640275387264e-05, + "loss": 1.9275, + "step": 2798 + }, + { + "epoch": 0.8495978145393838, + "grad_norm": 0.3660275936126709, + "learning_rate": 8.588134048800244e-05, + "loss": 1.7952, + "step": 2799 + }, + { + "epoch": 0.8499013507360753, + "grad_norm": 0.7100840210914612, + "learning_rate": 8.587627822213224e-05, + "loss": 1.8986, + "step": 2800 + }, + { + "epoch": 0.8502048869327667, + "grad_norm": 0.4502932131290436, + "learning_rate": 8.587121595626203e-05, + "loss": 1.8977, + "step": 2801 + }, + { + "epoch": 0.8505084231294582, + "grad_norm": 0.3382154107093811, + "learning_rate": 8.586615369039183e-05, + "loss": 1.8838, + "step": 2802 + }, + { + "epoch": 0.8508119593261496, + "grad_norm": 0.42528000473976135, + "learning_rate": 8.586109142452162e-05, + "loss": 2.4048, + "step": 2803 + }, + { + "epoch": 0.851115495522841, + "grad_norm": 0.41296571493148804, + "learning_rate": 8.585602915865142e-05, + "loss": 1.5728, + "step": 2804 + }, + { + "epoch": 0.8514190317195326, + "grad_norm": 0.4023008644580841, + "learning_rate": 8.585096689278121e-05, + "loss": 1.9834, + "step": 2805 + }, + { + "epoch": 0.851722567916224, + "grad_norm": 0.34381431341171265, + "learning_rate": 8.584590462691101e-05, + "loss": 2.0097, + "step": 2806 + }, + { + "epoch": 0.8520261041129155, + "grad_norm": 0.3973744809627533, + "learning_rate": 8.58408423610408e-05, + "loss": 2.008, + "step": 2807 + }, + { + "epoch": 0.8523296403096069, + "grad_norm": 0.40327900648117065, + "learning_rate": 8.583578009517061e-05, + "loss": 1.9841, + "step": 2808 + }, + { + "epoch": 0.8526331765062983, + "grad_norm": 0.3598316013813019, + "learning_rate": 8.58307178293004e-05, + "loss": 1.9542, + "step": 2809 + }, + { + "epoch": 0.8529367127029899, + "grad_norm": 0.4372098743915558, + "learning_rate": 8.58256555634302e-05, + "loss": 1.7054, + "step": 2810 + }, + { + "epoch": 0.8532402488996813, + "grad_norm": 0.39820167422294617, + "learning_rate": 8.582059329756e-05, + "loss": 1.6737, + "step": 2811 + }, + { + "epoch": 0.8535437850963727, + "grad_norm": 0.45620962977409363, + "learning_rate": 8.581553103168979e-05, + "loss": 1.895, + "step": 2812 + }, + { + "epoch": 0.8538473212930642, + "grad_norm": 0.4112420678138733, + "learning_rate": 8.581046876581958e-05, + "loss": 2.0469, + "step": 2813 + }, + { + "epoch": 0.8541508574897556, + "grad_norm": 0.4265507161617279, + "learning_rate": 8.580540649994938e-05, + "loss": 1.8141, + "step": 2814 + }, + { + "epoch": 0.8544543936864472, + "grad_norm": 0.9317876696586609, + "learning_rate": 8.580034423407917e-05, + "loss": 1.9912, + "step": 2815 + }, + { + "epoch": 0.8547579298831386, + "grad_norm": 0.41293710470199585, + "learning_rate": 8.579528196820897e-05, + "loss": 2.0593, + "step": 2816 + }, + { + "epoch": 0.85506146607983, + "grad_norm": 0.6074060201644897, + "learning_rate": 8.579021970233876e-05, + "loss": 1.7091, + "step": 2817 + }, + { + "epoch": 0.8553650022765215, + "grad_norm": 0.39665672183036804, + "learning_rate": 8.578515743646857e-05, + "loss": 1.5974, + "step": 2818 + }, + { + "epoch": 0.8556685384732129, + "grad_norm": 0.34235861897468567, + "learning_rate": 8.578009517059837e-05, + "loss": 1.7635, + "step": 2819 + }, + { + "epoch": 0.8559720746699044, + "grad_norm": 0.416742742061615, + "learning_rate": 8.577503290472816e-05, + "loss": 1.6473, + "step": 2820 + }, + { + "epoch": 0.8562756108665959, + "grad_norm": 0.41152289509773254, + "learning_rate": 8.576997063885796e-05, + "loss": 1.8142, + "step": 2821 + }, + { + "epoch": 0.8565791470632873, + "grad_norm": 0.44638922810554504, + "learning_rate": 8.576490837298775e-05, + "loss": 1.5819, + "step": 2822 + }, + { + "epoch": 0.8568826832599787, + "grad_norm": 0.38064852356910706, + "learning_rate": 8.575984610711755e-05, + "loss": 1.4263, + "step": 2823 + }, + { + "epoch": 0.8571862194566702, + "grad_norm": 0.41755181550979614, + "learning_rate": 8.575478384124734e-05, + "loss": 1.5668, + "step": 2824 + }, + { + "epoch": 0.8574897556533617, + "grad_norm": 0.45153340697288513, + "learning_rate": 8.574972157537714e-05, + "loss": 1.6601, + "step": 2825 + }, + { + "epoch": 0.8577932918500532, + "grad_norm": 0.3700641989707947, + "learning_rate": 8.574465930950693e-05, + "loss": 1.3577, + "step": 2826 + }, + { + "epoch": 0.8580968280467446, + "grad_norm": 0.44846633076667786, + "learning_rate": 8.573959704363674e-05, + "loss": 2.0195, + "step": 2827 + }, + { + "epoch": 0.858400364243436, + "grad_norm": 0.4378660023212433, + "learning_rate": 8.573453477776653e-05, + "loss": 1.7914, + "step": 2828 + }, + { + "epoch": 0.8587039004401275, + "grad_norm": 0.4284498691558838, + "learning_rate": 8.572947251189633e-05, + "loss": 1.8443, + "step": 2829 + }, + { + "epoch": 0.8590074366368189, + "grad_norm": 0.4318150579929352, + "learning_rate": 8.572441024602612e-05, + "loss": 1.7671, + "step": 2830 + }, + { + "epoch": 0.8593109728335104, + "grad_norm": 1.0121327638626099, + "learning_rate": 8.571934798015592e-05, + "loss": 1.9886, + "step": 2831 + }, + { + "epoch": 0.8596145090302019, + "grad_norm": 0.4319281578063965, + "learning_rate": 8.571428571428571e-05, + "loss": 1.8299, + "step": 2832 + }, + { + "epoch": 0.8599180452268933, + "grad_norm": 0.42897358536720276, + "learning_rate": 8.570922344841551e-05, + "loss": 2.0147, + "step": 2833 + }, + { + "epoch": 0.8602215814235847, + "grad_norm": 0.39335522055625916, + "learning_rate": 8.57041611825453e-05, + "loss": 1.5326, + "step": 2834 + }, + { + "epoch": 0.8605251176202762, + "grad_norm": 1.2661360502243042, + "learning_rate": 8.56990989166751e-05, + "loss": 2.1234, + "step": 2835 + }, + { + "epoch": 0.8608286538169677, + "grad_norm": 0.7632877230644226, + "learning_rate": 8.56940366508049e-05, + "loss": 1.8358, + "step": 2836 + }, + { + "epoch": 0.8611321900136591, + "grad_norm": 0.3894922733306885, + "learning_rate": 8.56889743849347e-05, + "loss": 1.9101, + "step": 2837 + }, + { + "epoch": 0.8614357262103506, + "grad_norm": 0.3832629919052124, + "learning_rate": 8.568391211906451e-05, + "loss": 1.772, + "step": 2838 + }, + { + "epoch": 0.861739262407042, + "grad_norm": 0.4298574924468994, + "learning_rate": 8.56788498531943e-05, + "loss": 1.4439, + "step": 2839 + }, + { + "epoch": 0.8620427986037335, + "grad_norm": 0.44331902265548706, + "learning_rate": 8.56737875873241e-05, + "loss": 1.9716, + "step": 2840 + }, + { + "epoch": 0.862346334800425, + "grad_norm": 0.43073487281799316, + "learning_rate": 8.566872532145389e-05, + "loss": 2.1116, + "step": 2841 + }, + { + "epoch": 0.8626498709971164, + "grad_norm": 0.4528077244758606, + "learning_rate": 8.566366305558369e-05, + "loss": 1.8215, + "step": 2842 + }, + { + "epoch": 0.8629534071938079, + "grad_norm": 0.43540868163108826, + "learning_rate": 8.565860078971348e-05, + "loss": 1.9536, + "step": 2843 + }, + { + "epoch": 0.8632569433904993, + "grad_norm": 0.4424208998680115, + "learning_rate": 8.565353852384328e-05, + "loss": 2.0443, + "step": 2844 + }, + { + "epoch": 0.8635604795871907, + "grad_norm": 0.42500391602516174, + "learning_rate": 8.564847625797307e-05, + "loss": 1.7454, + "step": 2845 + }, + { + "epoch": 0.8638640157838823, + "grad_norm": 0.5110988020896912, + "learning_rate": 8.564341399210287e-05, + "loss": 1.9146, + "step": 2846 + }, + { + "epoch": 0.8641675519805737, + "grad_norm": 0.44191688299179077, + "learning_rate": 8.563835172623268e-05, + "loss": 2.0113, + "step": 2847 + }, + { + "epoch": 0.8644710881772651, + "grad_norm": 0.42467302083969116, + "learning_rate": 8.563328946036247e-05, + "loss": 1.8504, + "step": 2848 + }, + { + "epoch": 0.8647746243739566, + "grad_norm": 0.48334258794784546, + "learning_rate": 8.562822719449226e-05, + "loss": 1.9385, + "step": 2849 + }, + { + "epoch": 0.865078160570648, + "grad_norm": 0.42993229627609253, + "learning_rate": 8.562316492862206e-05, + "loss": 1.7958, + "step": 2850 + }, + { + "epoch": 0.8653816967673396, + "grad_norm": 0.391629695892334, + "learning_rate": 8.561810266275185e-05, + "loss": 1.7413, + "step": 2851 + }, + { + "epoch": 0.865685232964031, + "grad_norm": 0.46686479449272156, + "learning_rate": 8.561304039688165e-05, + "loss": 1.9155, + "step": 2852 + }, + { + "epoch": 0.8659887691607224, + "grad_norm": 0.41826534271240234, + "learning_rate": 8.560797813101144e-05, + "loss": 1.7625, + "step": 2853 + }, + { + "epoch": 0.8662923053574139, + "grad_norm": 0.42303943634033203, + "learning_rate": 8.560291586514124e-05, + "loss": 2.0604, + "step": 2854 + }, + { + "epoch": 0.8665958415541053, + "grad_norm": 0.42215773463249207, + "learning_rate": 8.559785359927103e-05, + "loss": 2.0058, + "step": 2855 + }, + { + "epoch": 0.8668993777507967, + "grad_norm": 0.45129135251045227, + "learning_rate": 8.559279133340083e-05, + "loss": 1.7881, + "step": 2856 + }, + { + "epoch": 0.8672029139474883, + "grad_norm": 0.41676831245422363, + "learning_rate": 8.558772906753064e-05, + "loss": 1.8624, + "step": 2857 + }, + { + "epoch": 0.8675064501441797, + "grad_norm": 0.4166240990161896, + "learning_rate": 8.558266680166043e-05, + "loss": 1.8828, + "step": 2858 + }, + { + "epoch": 0.8678099863408711, + "grad_norm": 0.407652348279953, + "learning_rate": 8.557760453579023e-05, + "loss": 1.7462, + "step": 2859 + }, + { + "epoch": 0.8681135225375626, + "grad_norm": 0.42764970660209656, + "learning_rate": 8.557254226992002e-05, + "loss": 1.607, + "step": 2860 + }, + { + "epoch": 0.868417058734254, + "grad_norm": 0.4612043797969818, + "learning_rate": 8.556748000404982e-05, + "loss": 1.8854, + "step": 2861 + }, + { + "epoch": 0.8687205949309456, + "grad_norm": 0.35503968596458435, + "learning_rate": 8.556241773817961e-05, + "loss": 1.6402, + "step": 2862 + }, + { + "epoch": 0.869024131127637, + "grad_norm": 0.3938760459423065, + "learning_rate": 8.55573554723094e-05, + "loss": 1.7154, + "step": 2863 + }, + { + "epoch": 0.8693276673243284, + "grad_norm": 0.6211029291152954, + "learning_rate": 8.55522932064392e-05, + "loss": 1.3675, + "step": 2864 + }, + { + "epoch": 0.8696312035210199, + "grad_norm": 0.4015885293483734, + "learning_rate": 8.5547230940569e-05, + "loss": 1.1773, + "step": 2865 + }, + { + "epoch": 0.8699347397177113, + "grad_norm": 0.39615508913993835, + "learning_rate": 8.55421686746988e-05, + "loss": 1.816, + "step": 2866 + }, + { + "epoch": 0.8702382759144028, + "grad_norm": 0.4044322669506073, + "learning_rate": 8.55371064088286e-05, + "loss": 1.5465, + "step": 2867 + }, + { + "epoch": 0.8705418121110943, + "grad_norm": 0.410137802362442, + "learning_rate": 8.553204414295839e-05, + "loss": 1.7645, + "step": 2868 + }, + { + "epoch": 0.8708453483077857, + "grad_norm": 0.456717312335968, + "learning_rate": 8.552698187708819e-05, + "loss": 1.9905, + "step": 2869 + }, + { + "epoch": 0.8711488845044771, + "grad_norm": 0.3661191761493683, + "learning_rate": 8.552191961121798e-05, + "loss": 2.1578, + "step": 2870 + }, + { + "epoch": 0.8714524207011686, + "grad_norm": 0.3868817389011383, + "learning_rate": 8.551685734534778e-05, + "loss": 1.9996, + "step": 2871 + }, + { + "epoch": 0.8717559568978601, + "grad_norm": 0.35258975625038147, + "learning_rate": 8.551179507947757e-05, + "loss": 1.5475, + "step": 2872 + }, + { + "epoch": 0.8720594930945516, + "grad_norm": 0.4110967516899109, + "learning_rate": 8.550673281360737e-05, + "loss": 1.7818, + "step": 2873 + }, + { + "epoch": 0.872363029291243, + "grad_norm": 0.39448168873786926, + "learning_rate": 8.550167054773716e-05, + "loss": 1.8685, + "step": 2874 + }, + { + "epoch": 0.8726665654879344, + "grad_norm": 0.5225607752799988, + "learning_rate": 8.549660828186697e-05, + "loss": 2.056, + "step": 2875 + }, + { + "epoch": 0.8729701016846259, + "grad_norm": 0.4417632818222046, + "learning_rate": 8.549154601599676e-05, + "loss": 1.8328, + "step": 2876 + }, + { + "epoch": 0.8732736378813174, + "grad_norm": 0.3205631673336029, + "learning_rate": 8.548648375012656e-05, + "loss": 1.2795, + "step": 2877 + }, + { + "epoch": 0.8735771740780088, + "grad_norm": 0.35961270332336426, + "learning_rate": 8.548142148425635e-05, + "loss": 1.9222, + "step": 2878 + }, + { + "epoch": 0.8738807102747003, + "grad_norm": 0.4819619059562683, + "learning_rate": 8.547635921838615e-05, + "loss": 2.1051, + "step": 2879 + }, + { + "epoch": 0.8741842464713917, + "grad_norm": 0.4361310601234436, + "learning_rate": 8.547129695251596e-05, + "loss": 1.3526, + "step": 2880 + }, + { + "epoch": 0.8744877826680831, + "grad_norm": 0.41012874245643616, + "learning_rate": 8.546623468664575e-05, + "loss": 1.4308, + "step": 2881 + }, + { + "epoch": 0.8747913188647746, + "grad_norm": 0.4581417441368103, + "learning_rate": 8.546117242077555e-05, + "loss": 2.0414, + "step": 2882 + }, + { + "epoch": 0.8750948550614661, + "grad_norm": 0.5409611463546753, + "learning_rate": 8.545611015490534e-05, + "loss": 1.3438, + "step": 2883 + }, + { + "epoch": 0.8753983912581575, + "grad_norm": 0.390472412109375, + "learning_rate": 8.545104788903514e-05, + "loss": 1.6817, + "step": 2884 + }, + { + "epoch": 0.875701927454849, + "grad_norm": 0.5236276984214783, + "learning_rate": 8.544598562316493e-05, + "loss": 1.7992, + "step": 2885 + }, + { + "epoch": 0.8760054636515404, + "grad_norm": 0.43483301997184753, + "learning_rate": 8.544092335729474e-05, + "loss": 1.8767, + "step": 2886 + }, + { + "epoch": 0.8763089998482319, + "grad_norm": 0.5605120658874512, + "learning_rate": 8.543586109142453e-05, + "loss": 1.5323, + "step": 2887 + }, + { + "epoch": 0.8766125360449234, + "grad_norm": 0.4484270215034485, + "learning_rate": 8.543079882555433e-05, + "loss": 1.9958, + "step": 2888 + }, + { + "epoch": 0.8769160722416148, + "grad_norm": 0.40156564116477966, + "learning_rate": 8.542573655968412e-05, + "loss": 1.8555, + "step": 2889 + }, + { + "epoch": 0.8772196084383063, + "grad_norm": 0.42205923795700073, + "learning_rate": 8.542067429381392e-05, + "loss": 1.5127, + "step": 2890 + }, + { + "epoch": 0.8775231446349977, + "grad_norm": 0.40961888432502747, + "learning_rate": 8.541561202794371e-05, + "loss": 1.7508, + "step": 2891 + }, + { + "epoch": 0.8778266808316891, + "grad_norm": 0.4366128742694855, + "learning_rate": 8.541054976207351e-05, + "loss": 1.9704, + "step": 2892 + }, + { + "epoch": 0.8781302170283807, + "grad_norm": 0.4367973804473877, + "learning_rate": 8.54054874962033e-05, + "loss": 1.978, + "step": 2893 + }, + { + "epoch": 0.8784337532250721, + "grad_norm": 0.4191198945045471, + "learning_rate": 8.54004252303331e-05, + "loss": 1.93, + "step": 2894 + }, + { + "epoch": 0.8787372894217635, + "grad_norm": 0.40298399329185486, + "learning_rate": 8.539536296446289e-05, + "loss": 1.8862, + "step": 2895 + }, + { + "epoch": 0.879040825618455, + "grad_norm": 0.4513075351715088, + "learning_rate": 8.53903006985927e-05, + "loss": 1.4634, + "step": 2896 + }, + { + "epoch": 0.8793443618151464, + "grad_norm": 0.452395498752594, + "learning_rate": 8.53852384327225e-05, + "loss": 2.0012, + "step": 2897 + }, + { + "epoch": 0.879647898011838, + "grad_norm": 0.4072858691215515, + "learning_rate": 8.538017616685229e-05, + "loss": 1.7729, + "step": 2898 + }, + { + "epoch": 0.8799514342085294, + "grad_norm": 0.42640551924705505, + "learning_rate": 8.537511390098209e-05, + "loss": 1.4541, + "step": 2899 + }, + { + "epoch": 0.8802549704052208, + "grad_norm": 0.37970346212387085, + "learning_rate": 8.537005163511188e-05, + "loss": 1.7628, + "step": 2900 + }, + { + "epoch": 0.8805585066019123, + "grad_norm": 0.4421388804912567, + "learning_rate": 8.536498936924168e-05, + "loss": 1.7712, + "step": 2901 + }, + { + "epoch": 0.8808620427986037, + "grad_norm": 0.42706549167633057, + "learning_rate": 8.535992710337147e-05, + "loss": 1.1601, + "step": 2902 + }, + { + "epoch": 0.8811655789952952, + "grad_norm": 0.42218390107154846, + "learning_rate": 8.535486483750126e-05, + "loss": 2.081, + "step": 2903 + }, + { + "epoch": 0.8814691151919867, + "grad_norm": 0.4469526410102844, + "learning_rate": 8.534980257163106e-05, + "loss": 1.9124, + "step": 2904 + }, + { + "epoch": 0.8817726513886781, + "grad_norm": 0.42796406149864197, + "learning_rate": 8.534474030576087e-05, + "loss": 1.8129, + "step": 2905 + }, + { + "epoch": 0.8820761875853695, + "grad_norm": 0.549192488193512, + "learning_rate": 8.533967803989066e-05, + "loss": 1.7816, + "step": 2906 + }, + { + "epoch": 0.882379723782061, + "grad_norm": 0.3347112834453583, + "learning_rate": 8.533461577402046e-05, + "loss": 2.0233, + "step": 2907 + }, + { + "epoch": 0.8826832599787525, + "grad_norm": 0.4557845890522003, + "learning_rate": 8.532955350815025e-05, + "loss": 1.8571, + "step": 2908 + }, + { + "epoch": 0.882986796175444, + "grad_norm": 0.9646681547164917, + "learning_rate": 8.532449124228005e-05, + "loss": 1.9618, + "step": 2909 + }, + { + "epoch": 0.8832903323721354, + "grad_norm": 0.43224748969078064, + "learning_rate": 8.531942897640984e-05, + "loss": 1.9805, + "step": 2910 + }, + { + "epoch": 0.8835938685688268, + "grad_norm": 0.635966420173645, + "learning_rate": 8.531436671053964e-05, + "loss": 1.8572, + "step": 2911 + }, + { + "epoch": 0.8838974047655183, + "grad_norm": 0.46912774443626404, + "learning_rate": 8.530930444466943e-05, + "loss": 1.9028, + "step": 2912 + }, + { + "epoch": 0.8842009409622097, + "grad_norm": 0.37521597743034363, + "learning_rate": 8.530424217879923e-05, + "loss": 1.7295, + "step": 2913 + }, + { + "epoch": 0.8845044771589012, + "grad_norm": 0.761882483959198, + "learning_rate": 8.529917991292903e-05, + "loss": 1.8966, + "step": 2914 + }, + { + "epoch": 0.8848080133555927, + "grad_norm": 0.7302446365356445, + "learning_rate": 8.529411764705883e-05, + "loss": 1.9641, + "step": 2915 + }, + { + "epoch": 0.8851115495522841, + "grad_norm": 0.47133728861808777, + "learning_rate": 8.528905538118862e-05, + "loss": 1.4893, + "step": 2916 + }, + { + "epoch": 0.8854150857489755, + "grad_norm": 0.4192088842391968, + "learning_rate": 8.528399311531842e-05, + "loss": 1.9727, + "step": 2917 + }, + { + "epoch": 0.885718621945667, + "grad_norm": 0.43679505586624146, + "learning_rate": 8.527893084944821e-05, + "loss": 1.7336, + "step": 2918 + }, + { + "epoch": 0.8860221581423585, + "grad_norm": 0.4226585328578949, + "learning_rate": 8.527386858357801e-05, + "loss": 2.0413, + "step": 2919 + }, + { + "epoch": 0.88632569433905, + "grad_norm": 0.5971319079399109, + "learning_rate": 8.52688063177078e-05, + "loss": 1.5096, + "step": 2920 + }, + { + "epoch": 0.8866292305357414, + "grad_norm": 0.5016753673553467, + "learning_rate": 8.52637440518376e-05, + "loss": 1.9617, + "step": 2921 + }, + { + "epoch": 0.8869327667324328, + "grad_norm": 0.4263944625854492, + "learning_rate": 8.525868178596739e-05, + "loss": 1.866, + "step": 2922 + }, + { + "epoch": 0.8872363029291243, + "grad_norm": 0.4154115915298462, + "learning_rate": 8.525361952009719e-05, + "loss": 2.0773, + "step": 2923 + }, + { + "epoch": 0.8875398391258158, + "grad_norm": 0.45908719301223755, + "learning_rate": 8.5248557254227e-05, + "loss": 1.9902, + "step": 2924 + }, + { + "epoch": 0.8878433753225072, + "grad_norm": 0.43351608514785767, + "learning_rate": 8.52434949883568e-05, + "loss": 1.7658, + "step": 2925 + }, + { + "epoch": 0.8881469115191987, + "grad_norm": 0.4773117005825043, + "learning_rate": 8.52384327224866e-05, + "loss": 1.8285, + "step": 2926 + }, + { + "epoch": 0.8884504477158901, + "grad_norm": 0.604767382144928, + "learning_rate": 8.52333704566164e-05, + "loss": 1.9066, + "step": 2927 + }, + { + "epoch": 0.8887539839125815, + "grad_norm": 1.4794889688491821, + "learning_rate": 8.522830819074619e-05, + "loss": 1.6015, + "step": 2928 + }, + { + "epoch": 0.8890575201092731, + "grad_norm": 0.9518802165985107, + "learning_rate": 8.522324592487598e-05, + "loss": 2.0818, + "step": 2929 + }, + { + "epoch": 0.8893610563059645, + "grad_norm": 0.9635084271430969, + "learning_rate": 8.521818365900578e-05, + "loss": 1.926, + "step": 2930 + }, + { + "epoch": 0.889664592502656, + "grad_norm": 0.4159846007823944, + "learning_rate": 8.521312139313557e-05, + "loss": 1.8518, + "step": 2931 + }, + { + "epoch": 0.8899681286993474, + "grad_norm": 0.42167580127716064, + "learning_rate": 8.520805912726537e-05, + "loss": 1.9682, + "step": 2932 + }, + { + "epoch": 0.8902716648960388, + "grad_norm": 0.4509316384792328, + "learning_rate": 8.520299686139516e-05, + "loss": 1.9891, + "step": 2933 + }, + { + "epoch": 0.8905752010927304, + "grad_norm": 0.3452865183353424, + "learning_rate": 8.519793459552496e-05, + "loss": 1.8003, + "step": 2934 + }, + { + "epoch": 0.8908787372894218, + "grad_norm": 0.38899463415145874, + "learning_rate": 8.519287232965477e-05, + "loss": 1.8849, + "step": 2935 + }, + { + "epoch": 0.8911822734861132, + "grad_norm": 0.4010523557662964, + "learning_rate": 8.518781006378456e-05, + "loss": 1.7905, + "step": 2936 + }, + { + "epoch": 0.8914858096828047, + "grad_norm": 0.3848381042480469, + "learning_rate": 8.518274779791436e-05, + "loss": 1.8661, + "step": 2937 + }, + { + "epoch": 0.8917893458794961, + "grad_norm": 0.41806578636169434, + "learning_rate": 8.517768553204415e-05, + "loss": 1.9617, + "step": 2938 + }, + { + "epoch": 0.8920928820761875, + "grad_norm": 0.4648883640766144, + "learning_rate": 8.517262326617395e-05, + "loss": 1.1984, + "step": 2939 + }, + { + "epoch": 0.8923964182728791, + "grad_norm": 0.43756723403930664, + "learning_rate": 8.516756100030374e-05, + "loss": 1.5523, + "step": 2940 + }, + { + "epoch": 0.8926999544695705, + "grad_norm": 0.393741637468338, + "learning_rate": 8.516249873443353e-05, + "loss": 1.8876, + "step": 2941 + }, + { + "epoch": 0.893003490666262, + "grad_norm": 0.41412442922592163, + "learning_rate": 8.515743646856333e-05, + "loss": 1.4302, + "step": 2942 + }, + { + "epoch": 0.8933070268629534, + "grad_norm": 0.4743058681488037, + "learning_rate": 8.515237420269312e-05, + "loss": 1.9108, + "step": 2943 + }, + { + "epoch": 0.8936105630596448, + "grad_norm": 0.40074145793914795, + "learning_rate": 8.514731193682293e-05, + "loss": 2.0512, + "step": 2944 + }, + { + "epoch": 0.8939140992563364, + "grad_norm": 0.39886727929115295, + "learning_rate": 8.514224967095273e-05, + "loss": 1.8853, + "step": 2945 + }, + { + "epoch": 0.8942176354530278, + "grad_norm": 0.9438028335571289, + "learning_rate": 8.513718740508252e-05, + "loss": 2.1271, + "step": 2946 + }, + { + "epoch": 0.8945211716497192, + "grad_norm": 0.38940876722335815, + "learning_rate": 8.513212513921232e-05, + "loss": 1.9008, + "step": 2947 + }, + { + "epoch": 0.8948247078464107, + "grad_norm": 0.3668425381183624, + "learning_rate": 8.512706287334211e-05, + "loss": 1.8479, + "step": 2948 + }, + { + "epoch": 0.8951282440431021, + "grad_norm": 0.41969189047813416, + "learning_rate": 8.512200060747191e-05, + "loss": 2.0179, + "step": 2949 + }, + { + "epoch": 0.8954317802397936, + "grad_norm": 0.377257376909256, + "learning_rate": 8.51169383416017e-05, + "loss": 1.8602, + "step": 2950 + }, + { + "epoch": 0.8957353164364851, + "grad_norm": 0.47926634550094604, + "learning_rate": 8.51118760757315e-05, + "loss": 1.9323, + "step": 2951 + }, + { + "epoch": 0.8960388526331765, + "grad_norm": 0.4736182689666748, + "learning_rate": 8.510681380986129e-05, + "loss": 1.7645, + "step": 2952 + }, + { + "epoch": 0.8963423888298679, + "grad_norm": 0.45783525705337524, + "learning_rate": 8.51017515439911e-05, + "loss": 2.0585, + "step": 2953 + }, + { + "epoch": 0.8966459250265594, + "grad_norm": 0.4085424840450287, + "learning_rate": 8.50966892781209e-05, + "loss": 1.8516, + "step": 2954 + }, + { + "epoch": 0.8969494612232509, + "grad_norm": 0.4012138545513153, + "learning_rate": 8.509162701225069e-05, + "loss": 1.9907, + "step": 2955 + }, + { + "epoch": 0.8972529974199424, + "grad_norm": 0.4017476737499237, + "learning_rate": 8.508656474638048e-05, + "loss": 1.9477, + "step": 2956 + }, + { + "epoch": 0.8975565336166338, + "grad_norm": 0.3720763325691223, + "learning_rate": 8.508150248051028e-05, + "loss": 1.9121, + "step": 2957 + }, + { + "epoch": 0.8978600698133252, + "grad_norm": 0.3642348349094391, + "learning_rate": 8.507644021464007e-05, + "loss": 1.8507, + "step": 2958 + }, + { + "epoch": 0.8981636060100167, + "grad_norm": 0.46299463510513306, + "learning_rate": 8.507137794876987e-05, + "loss": 2.0811, + "step": 2959 + }, + { + "epoch": 0.8984671422067082, + "grad_norm": 0.3806562125682831, + "learning_rate": 8.506631568289966e-05, + "loss": 1.6783, + "step": 2960 + }, + { + "epoch": 0.8987706784033996, + "grad_norm": 0.4003051221370697, + "learning_rate": 8.506125341702946e-05, + "loss": 1.7978, + "step": 2961 + }, + { + "epoch": 0.8990742146000911, + "grad_norm": 0.42008984088897705, + "learning_rate": 8.505619115115925e-05, + "loss": 1.9064, + "step": 2962 + }, + { + "epoch": 0.8993777507967825, + "grad_norm": 0.4423260986804962, + "learning_rate": 8.505112888528906e-05, + "loss": 1.9743, + "step": 2963 + }, + { + "epoch": 0.8996812869934739, + "grad_norm": 0.4516521990299225, + "learning_rate": 8.504606661941886e-05, + "loss": 1.6559, + "step": 2964 + }, + { + "epoch": 0.8999848231901654, + "grad_norm": 0.4269407093524933, + "learning_rate": 8.504100435354865e-05, + "loss": 1.677, + "step": 2965 + }, + { + "epoch": 0.9002883593868569, + "grad_norm": 0.4931739568710327, + "learning_rate": 8.503594208767845e-05, + "loss": 1.7786, + "step": 2966 + }, + { + "epoch": 0.9005918955835484, + "grad_norm": 0.4014637768268585, + "learning_rate": 8.503087982180824e-05, + "loss": 2.0737, + "step": 2967 + }, + { + "epoch": 0.9008954317802398, + "grad_norm": 0.4077427387237549, + "learning_rate": 8.502581755593804e-05, + "loss": 1.9301, + "step": 2968 + }, + { + "epoch": 0.9011989679769312, + "grad_norm": 0.40187394618988037, + "learning_rate": 8.502075529006784e-05, + "loss": 2.0045, + "step": 2969 + }, + { + "epoch": 0.9015025041736227, + "grad_norm": 0.6499014496803284, + "learning_rate": 8.501569302419764e-05, + "loss": 2.066, + "step": 2970 + }, + { + "epoch": 0.9018060403703142, + "grad_norm": 0.4545782804489136, + "learning_rate": 8.501063075832743e-05, + "loss": 1.7235, + "step": 2971 + }, + { + "epoch": 0.9021095765670056, + "grad_norm": 0.4248214066028595, + "learning_rate": 8.500556849245723e-05, + "loss": 2.0612, + "step": 2972 + }, + { + "epoch": 0.9024131127636971, + "grad_norm": 0.49832749366760254, + "learning_rate": 8.500050622658702e-05, + "loss": 1.9235, + "step": 2973 + }, + { + "epoch": 0.9027166489603885, + "grad_norm": 0.35065793991088867, + "learning_rate": 8.499544396071683e-05, + "loss": 1.7691, + "step": 2974 + }, + { + "epoch": 0.9030201851570799, + "grad_norm": 0.40507805347442627, + "learning_rate": 8.499038169484663e-05, + "loss": 2.055, + "step": 2975 + }, + { + "epoch": 0.9033237213537715, + "grad_norm": 0.44182345271110535, + "learning_rate": 8.498531942897642e-05, + "loss": 1.5353, + "step": 2976 + }, + { + "epoch": 0.9036272575504629, + "grad_norm": 0.4512852430343628, + "learning_rate": 8.498025716310622e-05, + "loss": 1.9116, + "step": 2977 + }, + { + "epoch": 0.9039307937471543, + "grad_norm": 0.44310954213142395, + "learning_rate": 8.497519489723601e-05, + "loss": 1.6875, + "step": 2978 + }, + { + "epoch": 0.9042343299438458, + "grad_norm": 0.4079609215259552, + "learning_rate": 8.49701326313658e-05, + "loss": 1.0174, + "step": 2979 + }, + { + "epoch": 0.9045378661405372, + "grad_norm": 0.3950175940990448, + "learning_rate": 8.49650703654956e-05, + "loss": 1.933, + "step": 2980 + }, + { + "epoch": 0.9048414023372288, + "grad_norm": 0.3858761787414551, + "learning_rate": 8.49600080996254e-05, + "loss": 1.6877, + "step": 2981 + }, + { + "epoch": 0.9051449385339202, + "grad_norm": 0.41248536109924316, + "learning_rate": 8.495494583375519e-05, + "loss": 1.1987, + "step": 2982 + }, + { + "epoch": 0.9054484747306116, + "grad_norm": 0.3943655490875244, + "learning_rate": 8.4949883567885e-05, + "loss": 1.5532, + "step": 2983 + }, + { + "epoch": 0.9057520109273031, + "grad_norm": 0.37889233231544495, + "learning_rate": 8.494482130201479e-05, + "loss": 1.8119, + "step": 2984 + }, + { + "epoch": 0.9060555471239945, + "grad_norm": 0.3723227381706238, + "learning_rate": 8.493975903614459e-05, + "loss": 1.8415, + "step": 2985 + }, + { + "epoch": 0.906359083320686, + "grad_norm": 0.4503065347671509, + "learning_rate": 8.493469677027438e-05, + "loss": 1.6841, + "step": 2986 + }, + { + "epoch": 0.9066626195173775, + "grad_norm": 0.41649529337882996, + "learning_rate": 8.492963450440418e-05, + "loss": 1.9298, + "step": 2987 + }, + { + "epoch": 0.9069661557140689, + "grad_norm": 0.3602710962295532, + "learning_rate": 8.492457223853397e-05, + "loss": 2.167, + "step": 2988 + }, + { + "epoch": 0.9072696919107603, + "grad_norm": 0.39875030517578125, + "learning_rate": 8.491950997266377e-05, + "loss": 2.1081, + "step": 2989 + }, + { + "epoch": 0.9075732281074518, + "grad_norm": 0.42908263206481934, + "learning_rate": 8.491444770679356e-05, + "loss": 1.7717, + "step": 2990 + }, + { + "epoch": 0.9078767643041433, + "grad_norm": 0.4125417470932007, + "learning_rate": 8.490938544092336e-05, + "loss": 2.2085, + "step": 2991 + }, + { + "epoch": 0.9081803005008348, + "grad_norm": 0.4204493463039398, + "learning_rate": 8.490432317505316e-05, + "loss": 1.7223, + "step": 2992 + }, + { + "epoch": 0.9084838366975262, + "grad_norm": 0.48912370204925537, + "learning_rate": 8.489926090918296e-05, + "loss": 1.7614, + "step": 2993 + }, + { + "epoch": 0.9087873728942176, + "grad_norm": 0.44855475425720215, + "learning_rate": 8.489419864331275e-05, + "loss": 1.9345, + "step": 2994 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.35656431317329407, + "learning_rate": 8.488913637744255e-05, + "loss": 1.3761, + "step": 2995 + }, + { + "epoch": 0.9093944452876005, + "grad_norm": 0.431643545627594, + "learning_rate": 8.488407411157234e-05, + "loss": 2.0976, + "step": 2996 + }, + { + "epoch": 0.909697981484292, + "grad_norm": 0.3734411299228668, + "learning_rate": 8.487901184570214e-05, + "loss": 1.5943, + "step": 2997 + }, + { + "epoch": 0.9100015176809835, + "grad_norm": 0.4734323024749756, + "learning_rate": 8.487394957983193e-05, + "loss": 1.9213, + "step": 2998 + }, + { + "epoch": 0.9103050538776749, + "grad_norm": 0.39105552434921265, + "learning_rate": 8.486888731396173e-05, + "loss": 1.7569, + "step": 2999 + }, + { + "epoch": 0.9106085900743663, + "grad_norm": 0.35980144143104553, + "learning_rate": 8.486382504809152e-05, + "loss": 1.4274, + "step": 3000 + }, + { + "epoch": 0.9109121262710578, + "grad_norm": 0.6007620692253113, + "learning_rate": 8.485876278222132e-05, + "loss": 1.7682, + "step": 3001 + }, + { + "epoch": 0.9112156624677493, + "grad_norm": 0.493499755859375, + "learning_rate": 8.485370051635113e-05, + "loss": 2.0509, + "step": 3002 + }, + { + "epoch": 0.9115191986644408, + "grad_norm": 1.5949187278747559, + "learning_rate": 8.484863825048092e-05, + "loss": 1.3541, + "step": 3003 + }, + { + "epoch": 0.9118227348611322, + "grad_norm": 0.4298441410064697, + "learning_rate": 8.484357598461072e-05, + "loss": 1.9893, + "step": 3004 + }, + { + "epoch": 0.9121262710578236, + "grad_norm": 0.4736660420894623, + "learning_rate": 8.483851371874051e-05, + "loss": 2.0483, + "step": 3005 + }, + { + "epoch": 0.9124298072545151, + "grad_norm": 0.39900222420692444, + "learning_rate": 8.48334514528703e-05, + "loss": 1.9998, + "step": 3006 + }, + { + "epoch": 0.9127333434512066, + "grad_norm": 0.43523257970809937, + "learning_rate": 8.48283891870001e-05, + "loss": 2.0205, + "step": 3007 + }, + { + "epoch": 0.913036879647898, + "grad_norm": 0.43466445803642273, + "learning_rate": 8.48233269211299e-05, + "loss": 2.0233, + "step": 3008 + }, + { + "epoch": 0.9133404158445895, + "grad_norm": 0.4107099175453186, + "learning_rate": 8.481826465525969e-05, + "loss": 1.8499, + "step": 3009 + }, + { + "epoch": 0.9136439520412809, + "grad_norm": 0.3902339041233063, + "learning_rate": 8.481320238938948e-05, + "loss": 1.6908, + "step": 3010 + }, + { + "epoch": 0.9139474882379723, + "grad_norm": 0.42095887660980225, + "learning_rate": 8.480814012351929e-05, + "loss": 1.8919, + "step": 3011 + }, + { + "epoch": 0.9142510244346639, + "grad_norm": 0.4260912537574768, + "learning_rate": 8.480307785764909e-05, + "loss": 1.8475, + "step": 3012 + }, + { + "epoch": 0.9145545606313553, + "grad_norm": 0.49090254306793213, + "learning_rate": 8.479801559177888e-05, + "loss": 1.284, + "step": 3013 + }, + { + "epoch": 0.9148580968280468, + "grad_norm": 0.5418572425842285, + "learning_rate": 8.479295332590869e-05, + "loss": 1.4133, + "step": 3014 + }, + { + "epoch": 0.9151616330247382, + "grad_norm": 0.46666690707206726, + "learning_rate": 8.478789106003849e-05, + "loss": 1.6623, + "step": 3015 + }, + { + "epoch": 0.9154651692214296, + "grad_norm": 0.3846968710422516, + "learning_rate": 8.478282879416828e-05, + "loss": 2.0276, + "step": 3016 + }, + { + "epoch": 0.9157687054181212, + "grad_norm": 0.3893693685531616, + "learning_rate": 8.477776652829807e-05, + "loss": 1.77, + "step": 3017 + }, + { + "epoch": 0.9160722416148126, + "grad_norm": 0.46749499440193176, + "learning_rate": 8.477270426242787e-05, + "loss": 1.9687, + "step": 3018 + }, + { + "epoch": 0.916375777811504, + "grad_norm": 0.5093361735343933, + "learning_rate": 8.476764199655766e-05, + "loss": 1.5275, + "step": 3019 + }, + { + "epoch": 0.9166793140081955, + "grad_norm": 0.415081650018692, + "learning_rate": 8.476257973068746e-05, + "loss": 2.1493, + "step": 3020 + }, + { + "epoch": 0.9169828502048869, + "grad_norm": 0.43510860204696655, + "learning_rate": 8.475751746481725e-05, + "loss": 2.1311, + "step": 3021 + }, + { + "epoch": 0.9172863864015783, + "grad_norm": 0.6513949036598206, + "learning_rate": 8.475245519894706e-05, + "loss": 1.6994, + "step": 3022 + }, + { + "epoch": 0.9175899225982699, + "grad_norm": 0.3848845362663269, + "learning_rate": 8.474739293307686e-05, + "loss": 1.9801, + "step": 3023 + }, + { + "epoch": 0.9178934587949613, + "grad_norm": 0.41188767552375793, + "learning_rate": 8.474233066720665e-05, + "loss": 1.8483, + "step": 3024 + }, + { + "epoch": 0.9181969949916527, + "grad_norm": 0.4070264399051666, + "learning_rate": 8.473726840133645e-05, + "loss": 1.7551, + "step": 3025 + }, + { + "epoch": 0.9185005311883442, + "grad_norm": 0.3943238854408264, + "learning_rate": 8.473220613546624e-05, + "loss": 1.9301, + "step": 3026 + }, + { + "epoch": 0.9188040673850356, + "grad_norm": 0.39565154910087585, + "learning_rate": 8.472714386959604e-05, + "loss": 1.9999, + "step": 3027 + }, + { + "epoch": 0.9191076035817272, + "grad_norm": 0.4589279592037201, + "learning_rate": 8.472208160372583e-05, + "loss": 1.6646, + "step": 3028 + }, + { + "epoch": 0.9194111397784186, + "grad_norm": 0.41473063826560974, + "learning_rate": 8.471701933785563e-05, + "loss": 1.611, + "step": 3029 + }, + { + "epoch": 0.91971467597511, + "grad_norm": 0.42082396149635315, + "learning_rate": 8.471195707198542e-05, + "loss": 1.923, + "step": 3030 + }, + { + "epoch": 0.9200182121718015, + "grad_norm": 0.5210884809494019, + "learning_rate": 8.470689480611523e-05, + "loss": 2.0562, + "step": 3031 + }, + { + "epoch": 0.9203217483684929, + "grad_norm": 0.462028831243515, + "learning_rate": 8.470183254024502e-05, + "loss": 2.1234, + "step": 3032 + }, + { + "epoch": 0.9206252845651844, + "grad_norm": 0.452634334564209, + "learning_rate": 8.469677027437482e-05, + "loss": 1.8217, + "step": 3033 + }, + { + "epoch": 0.9209288207618759, + "grad_norm": 0.4052957594394684, + "learning_rate": 8.469170800850461e-05, + "loss": 1.782, + "step": 3034 + }, + { + "epoch": 0.9212323569585673, + "grad_norm": 0.4169760048389435, + "learning_rate": 8.468664574263441e-05, + "loss": 1.9965, + "step": 3035 + }, + { + "epoch": 0.9215358931552587, + "grad_norm": 0.41070201992988586, + "learning_rate": 8.46815834767642e-05, + "loss": 1.4293, + "step": 3036 + }, + { + "epoch": 0.9218394293519502, + "grad_norm": 0.4272339940071106, + "learning_rate": 8.4676521210894e-05, + "loss": 1.7909, + "step": 3037 + }, + { + "epoch": 0.9221429655486417, + "grad_norm": 0.3842034935951233, + "learning_rate": 8.467145894502379e-05, + "loss": 1.8883, + "step": 3038 + }, + { + "epoch": 0.9224465017453332, + "grad_norm": 0.4226452112197876, + "learning_rate": 8.466639667915359e-05, + "loss": 1.8218, + "step": 3039 + }, + { + "epoch": 0.9227500379420246, + "grad_norm": 0.4224850833415985, + "learning_rate": 8.466133441328338e-05, + "loss": 1.833, + "step": 3040 + }, + { + "epoch": 0.923053574138716, + "grad_norm": 0.41073185205459595, + "learning_rate": 8.465627214741319e-05, + "loss": 2.0014, + "step": 3041 + }, + { + "epoch": 0.9233571103354075, + "grad_norm": 0.44303256273269653, + "learning_rate": 8.465120988154299e-05, + "loss": 2.058, + "step": 3042 + }, + { + "epoch": 0.923660646532099, + "grad_norm": 0.4708426594734192, + "learning_rate": 8.464614761567278e-05, + "loss": 1.7387, + "step": 3043 + }, + { + "epoch": 0.9239641827287904, + "grad_norm": 0.36340072751045227, + "learning_rate": 8.464108534980257e-05, + "loss": 1.7583, + "step": 3044 + }, + { + "epoch": 0.9242677189254819, + "grad_norm": 0.45171916484832764, + "learning_rate": 8.463602308393237e-05, + "loss": 1.5946, + "step": 3045 + }, + { + "epoch": 0.9245712551221733, + "grad_norm": 0.39633792638778687, + "learning_rate": 8.463096081806216e-05, + "loss": 1.9707, + "step": 3046 + }, + { + "epoch": 0.9248747913188647, + "grad_norm": 0.43228423595428467, + "learning_rate": 8.462589855219196e-05, + "loss": 1.9343, + "step": 3047 + }, + { + "epoch": 0.9251783275155562, + "grad_norm": 0.37613019347190857, + "learning_rate": 8.462083628632175e-05, + "loss": 1.9903, + "step": 3048 + }, + { + "epoch": 0.9254818637122477, + "grad_norm": 0.4591895341873169, + "learning_rate": 8.461577402045155e-05, + "loss": 1.6375, + "step": 3049 + }, + { + "epoch": 0.9257853999089392, + "grad_norm": 0.419057697057724, + "learning_rate": 8.461071175458136e-05, + "loss": 1.5849, + "step": 3050 + }, + { + "epoch": 0.9260889361056306, + "grad_norm": 0.4567117393016815, + "learning_rate": 8.460564948871115e-05, + "loss": 1.7519, + "step": 3051 + }, + { + "epoch": 0.926392472302322, + "grad_norm": 0.49061158299446106, + "learning_rate": 8.460058722284095e-05, + "loss": 2.0161, + "step": 3052 + }, + { + "epoch": 0.9266960084990135, + "grad_norm": 0.4232182502746582, + "learning_rate": 8.459552495697074e-05, + "loss": 1.8288, + "step": 3053 + }, + { + "epoch": 0.926999544695705, + "grad_norm": 0.4958473742008209, + "learning_rate": 8.459046269110054e-05, + "loss": 1.5602, + "step": 3054 + }, + { + "epoch": 0.9273030808923964, + "grad_norm": 0.4351857602596283, + "learning_rate": 8.458540042523033e-05, + "loss": 1.7898, + "step": 3055 + }, + { + "epoch": 0.9276066170890879, + "grad_norm": 0.3431285619735718, + "learning_rate": 8.458033815936013e-05, + "loss": 1.8609, + "step": 3056 + }, + { + "epoch": 0.9279101532857793, + "grad_norm": 0.40975016355514526, + "learning_rate": 8.457527589348992e-05, + "loss": 1.9818, + "step": 3057 + }, + { + "epoch": 0.9282136894824707, + "grad_norm": 0.42433828115463257, + "learning_rate": 8.457021362761973e-05, + "loss": 1.4293, + "step": 3058 + }, + { + "epoch": 0.9285172256791623, + "grad_norm": 0.37647899985313416, + "learning_rate": 8.456515136174952e-05, + "loss": 1.641, + "step": 3059 + }, + { + "epoch": 0.9288207618758537, + "grad_norm": 0.39693647623062134, + "learning_rate": 8.456008909587932e-05, + "loss": 1.6236, + "step": 3060 + }, + { + "epoch": 0.9291242980725452, + "grad_norm": 0.4057548940181732, + "learning_rate": 8.455502683000913e-05, + "loss": 1.7059, + "step": 3061 + }, + { + "epoch": 0.9294278342692366, + "grad_norm": 0.4264809787273407, + "learning_rate": 8.454996456413892e-05, + "loss": 1.9343, + "step": 3062 + }, + { + "epoch": 0.929731370465928, + "grad_norm": 0.4178743362426758, + "learning_rate": 8.454490229826872e-05, + "loss": 1.8086, + "step": 3063 + }, + { + "epoch": 0.9300349066626196, + "grad_norm": 0.4191538095474243, + "learning_rate": 8.453984003239851e-05, + "loss": 1.7379, + "step": 3064 + }, + { + "epoch": 0.930338442859311, + "grad_norm": 0.3979043960571289, + "learning_rate": 8.45347777665283e-05, + "loss": 1.193, + "step": 3065 + }, + { + "epoch": 0.9306419790560024, + "grad_norm": 0.49591994285583496, + "learning_rate": 8.45297155006581e-05, + "loss": 2.0942, + "step": 3066 + }, + { + "epoch": 0.9309455152526939, + "grad_norm": 0.37868285179138184, + "learning_rate": 8.45246532347879e-05, + "loss": 1.9563, + "step": 3067 + }, + { + "epoch": 0.9312490514493853, + "grad_norm": 0.6344965696334839, + "learning_rate": 8.451959096891769e-05, + "loss": 2.0734, + "step": 3068 + }, + { + "epoch": 0.9315525876460768, + "grad_norm": 0.3740929067134857, + "learning_rate": 8.451452870304749e-05, + "loss": 1.507, + "step": 3069 + }, + { + "epoch": 0.9318561238427683, + "grad_norm": 0.4502614736557007, + "learning_rate": 8.45094664371773e-05, + "loss": 2.2908, + "step": 3070 + }, + { + "epoch": 0.9321596600394597, + "grad_norm": 0.4272226393222809, + "learning_rate": 8.450440417130709e-05, + "loss": 2.0146, + "step": 3071 + }, + { + "epoch": 0.9324631962361511, + "grad_norm": 0.4830857217311859, + "learning_rate": 8.449934190543688e-05, + "loss": 1.9573, + "step": 3072 + }, + { + "epoch": 0.9327667324328426, + "grad_norm": 0.4085777997970581, + "learning_rate": 8.449427963956668e-05, + "loss": 1.7846, + "step": 3073 + }, + { + "epoch": 0.933070268629534, + "grad_norm": 0.41114342212677, + "learning_rate": 8.448921737369647e-05, + "loss": 1.8357, + "step": 3074 + }, + { + "epoch": 0.9333738048262256, + "grad_norm": 0.4729441702365875, + "learning_rate": 8.448415510782627e-05, + "loss": 1.7578, + "step": 3075 + }, + { + "epoch": 0.933677341022917, + "grad_norm": 0.5033143758773804, + "learning_rate": 8.447909284195606e-05, + "loss": 1.5804, + "step": 3076 + }, + { + "epoch": 0.9339808772196084, + "grad_norm": 0.39915481209754944, + "learning_rate": 8.447403057608586e-05, + "loss": 1.9723, + "step": 3077 + }, + { + "epoch": 0.9342844134162999, + "grad_norm": 0.48224571347236633, + "learning_rate": 8.446896831021565e-05, + "loss": 2.0484, + "step": 3078 + }, + { + "epoch": 0.9345879496129913, + "grad_norm": 0.4606854319572449, + "learning_rate": 8.446390604434545e-05, + "loss": 1.687, + "step": 3079 + }, + { + "epoch": 0.9348914858096828, + "grad_norm": 0.5270693898200989, + "learning_rate": 8.445884377847526e-05, + "loss": 1.9201, + "step": 3080 + }, + { + "epoch": 0.9351950220063743, + "grad_norm": 0.39141446352005005, + "learning_rate": 8.445378151260505e-05, + "loss": 1.8601, + "step": 3081 + }, + { + "epoch": 0.9354985582030657, + "grad_norm": 0.3756524920463562, + "learning_rate": 8.444871924673484e-05, + "loss": 1.5993, + "step": 3082 + }, + { + "epoch": 0.9358020943997571, + "grad_norm": 1.0641098022460938, + "learning_rate": 8.444365698086464e-05, + "loss": 1.7314, + "step": 3083 + }, + { + "epoch": 0.9361056305964486, + "grad_norm": 0.4183383584022522, + "learning_rate": 8.443859471499443e-05, + "loss": 1.5999, + "step": 3084 + }, + { + "epoch": 0.9364091667931401, + "grad_norm": 0.43228405714035034, + "learning_rate": 8.443353244912423e-05, + "loss": 1.7582, + "step": 3085 + }, + { + "epoch": 0.9367127029898316, + "grad_norm": 0.38039523363113403, + "learning_rate": 8.442847018325402e-05, + "loss": 1.9152, + "step": 3086 + }, + { + "epoch": 0.937016239186523, + "grad_norm": 0.3952110707759857, + "learning_rate": 8.442340791738382e-05, + "loss": 1.4944, + "step": 3087 + }, + { + "epoch": 0.9373197753832144, + "grad_norm": 0.40946584939956665, + "learning_rate": 8.441834565151361e-05, + "loss": 1.8466, + "step": 3088 + }, + { + "epoch": 0.9376233115799059, + "grad_norm": 0.3978438675403595, + "learning_rate": 8.441328338564342e-05, + "loss": 1.6802, + "step": 3089 + }, + { + "epoch": 0.9379268477765974, + "grad_norm": 0.4061533808708191, + "learning_rate": 8.440822111977322e-05, + "loss": 1.744, + "step": 3090 + }, + { + "epoch": 0.9382303839732888, + "grad_norm": 0.4017331004142761, + "learning_rate": 8.440315885390301e-05, + "loss": 1.6486, + "step": 3091 + }, + { + "epoch": 0.9385339201699803, + "grad_norm": 0.4937646687030792, + "learning_rate": 8.43980965880328e-05, + "loss": 1.9909, + "step": 3092 + }, + { + "epoch": 0.9388374563666717, + "grad_norm": 0.5023379921913147, + "learning_rate": 8.43930343221626e-05, + "loss": 1.5965, + "step": 3093 + }, + { + "epoch": 0.9391409925633631, + "grad_norm": 0.47406286001205444, + "learning_rate": 8.43879720562924e-05, + "loss": 1.7296, + "step": 3094 + }, + { + "epoch": 0.9394445287600547, + "grad_norm": 0.4132237136363983, + "learning_rate": 8.438290979042219e-05, + "loss": 2.0174, + "step": 3095 + }, + { + "epoch": 0.9397480649567461, + "grad_norm": 0.39927050471305847, + "learning_rate": 8.437784752455199e-05, + "loss": 1.9771, + "step": 3096 + }, + { + "epoch": 0.9400516011534376, + "grad_norm": 0.4007806181907654, + "learning_rate": 8.437278525868178e-05, + "loss": 2.0763, + "step": 3097 + }, + { + "epoch": 0.940355137350129, + "grad_norm": 0.45542803406715393, + "learning_rate": 8.436772299281159e-05, + "loss": 1.6179, + "step": 3098 + }, + { + "epoch": 0.9406586735468204, + "grad_norm": 0.4620111286640167, + "learning_rate": 8.436266072694138e-05, + "loss": 1.5975, + "step": 3099 + }, + { + "epoch": 0.940962209743512, + "grad_norm": 0.41753649711608887, + "learning_rate": 8.435759846107118e-05, + "loss": 1.8776, + "step": 3100 + }, + { + "epoch": 0.9412657459402034, + "grad_norm": 0.4777916371822357, + "learning_rate": 8.435253619520097e-05, + "loss": 1.503, + "step": 3101 + }, + { + "epoch": 0.9415692821368948, + "grad_norm": 0.3927490711212158, + "learning_rate": 8.434747392933077e-05, + "loss": 2.0409, + "step": 3102 + }, + { + "epoch": 0.9418728183335863, + "grad_norm": 0.4261821210384369, + "learning_rate": 8.434241166346058e-05, + "loss": 1.9552, + "step": 3103 + }, + { + "epoch": 0.9421763545302777, + "grad_norm": 0.4125726521015167, + "learning_rate": 8.433734939759037e-05, + "loss": 2.0691, + "step": 3104 + }, + { + "epoch": 0.9424798907269691, + "grad_norm": 0.46787095069885254, + "learning_rate": 8.433228713172017e-05, + "loss": 2.1579, + "step": 3105 + }, + { + "epoch": 0.9427834269236607, + "grad_norm": 0.47438356280326843, + "learning_rate": 8.432722486584996e-05, + "loss": 1.9795, + "step": 3106 + }, + { + "epoch": 0.9430869631203521, + "grad_norm": 0.39783594012260437, + "learning_rate": 8.432216259997976e-05, + "loss": 1.8703, + "step": 3107 + }, + { + "epoch": 0.9433904993170436, + "grad_norm": 0.3802180588245392, + "learning_rate": 8.431710033410955e-05, + "loss": 1.8415, + "step": 3108 + }, + { + "epoch": 0.943694035513735, + "grad_norm": 0.3957035541534424, + "learning_rate": 8.431203806823936e-05, + "loss": 1.8566, + "step": 3109 + }, + { + "epoch": 0.9439975717104264, + "grad_norm": 0.4278394281864166, + "learning_rate": 8.430697580236915e-05, + "loss": 1.5011, + "step": 3110 + }, + { + "epoch": 0.944301107907118, + "grad_norm": 0.4553667902946472, + "learning_rate": 8.430191353649895e-05, + "loss": 1.6098, + "step": 3111 + }, + { + "epoch": 0.9446046441038094, + "grad_norm": 0.47047415375709534, + "learning_rate": 8.429685127062874e-05, + "loss": 1.4019, + "step": 3112 + }, + { + "epoch": 0.9449081803005008, + "grad_norm": 0.4949280619621277, + "learning_rate": 8.429178900475854e-05, + "loss": 2.0078, + "step": 3113 + }, + { + "epoch": 0.9452117164971923, + "grad_norm": 0.43883949518203735, + "learning_rate": 8.428672673888833e-05, + "loss": 2.1833, + "step": 3114 + }, + { + "epoch": 0.9455152526938837, + "grad_norm": 0.44522538781166077, + "learning_rate": 8.428166447301813e-05, + "loss": 1.8481, + "step": 3115 + }, + { + "epoch": 0.9458187888905752, + "grad_norm": 0.4013282060623169, + "learning_rate": 8.427660220714792e-05, + "loss": 2.224, + "step": 3116 + }, + { + "epoch": 0.9461223250872667, + "grad_norm": 0.6805521249771118, + "learning_rate": 8.427153994127772e-05, + "loss": 1.3343, + "step": 3117 + }, + { + "epoch": 0.9464258612839581, + "grad_norm": 0.3767577111721039, + "learning_rate": 8.426647767540751e-05, + "loss": 1.9997, + "step": 3118 + }, + { + "epoch": 0.9467293974806495, + "grad_norm": 0.43915873765945435, + "learning_rate": 8.426141540953732e-05, + "loss": 1.7449, + "step": 3119 + }, + { + "epoch": 0.947032933677341, + "grad_norm": 0.4347434639930725, + "learning_rate": 8.425635314366711e-05, + "loss": 1.8674, + "step": 3120 + }, + { + "epoch": 0.9473364698740325, + "grad_norm": 0.39332154393196106, + "learning_rate": 8.425129087779691e-05, + "loss": 1.7705, + "step": 3121 + }, + { + "epoch": 0.947640006070724, + "grad_norm": 0.3791426122188568, + "learning_rate": 8.42462286119267e-05, + "loss": 2.0012, + "step": 3122 + }, + { + "epoch": 0.9479435422674154, + "grad_norm": 0.4299080967903137, + "learning_rate": 8.42411663460565e-05, + "loss": 2.0032, + "step": 3123 + }, + { + "epoch": 0.9482470784641068, + "grad_norm": 0.4709744155406952, + "learning_rate": 8.42361040801863e-05, + "loss": 1.6847, + "step": 3124 + }, + { + "epoch": 0.9485506146607983, + "grad_norm": 0.4098724126815796, + "learning_rate": 8.423104181431609e-05, + "loss": 1.6806, + "step": 3125 + }, + { + "epoch": 0.9488541508574898, + "grad_norm": 0.42105549573898315, + "learning_rate": 8.422597954844588e-05, + "loss": 2.0726, + "step": 3126 + }, + { + "epoch": 0.9491576870541812, + "grad_norm": 0.43502575159072876, + "learning_rate": 8.422091728257568e-05, + "loss": 2.081, + "step": 3127 + }, + { + "epoch": 0.9494612232508727, + "grad_norm": 2.2358832359313965, + "learning_rate": 8.421585501670549e-05, + "loss": 1.7987, + "step": 3128 + }, + { + "epoch": 0.9497647594475641, + "grad_norm": 0.4136318564414978, + "learning_rate": 8.421079275083528e-05, + "loss": 1.8534, + "step": 3129 + }, + { + "epoch": 0.9500682956442555, + "grad_norm": 0.4346698820590973, + "learning_rate": 8.420573048496508e-05, + "loss": 1.736, + "step": 3130 + }, + { + "epoch": 0.950371831840947, + "grad_norm": 0.4190434217453003, + "learning_rate": 8.420066821909487e-05, + "loss": 1.9535, + "step": 3131 + }, + { + "epoch": 0.9506753680376385, + "grad_norm": 0.41435617208480835, + "learning_rate": 8.419560595322467e-05, + "loss": 1.7954, + "step": 3132 + }, + { + "epoch": 0.95097890423433, + "grad_norm": 0.48440879583358765, + "learning_rate": 8.419054368735446e-05, + "loss": 1.8961, + "step": 3133 + }, + { + "epoch": 0.9512824404310214, + "grad_norm": 0.43945470452308655, + "learning_rate": 8.418548142148426e-05, + "loss": 1.6069, + "step": 3134 + }, + { + "epoch": 0.9515859766277128, + "grad_norm": 0.4803379774093628, + "learning_rate": 8.418041915561405e-05, + "loss": 1.763, + "step": 3135 + }, + { + "epoch": 0.9518895128244043, + "grad_norm": 0.6266827583312988, + "learning_rate": 8.417535688974385e-05, + "loss": 2.0856, + "step": 3136 + }, + { + "epoch": 0.9521930490210958, + "grad_norm": 0.39886969327926636, + "learning_rate": 8.417029462387365e-05, + "loss": 2.0025, + "step": 3137 + }, + { + "epoch": 0.9524965852177872, + "grad_norm": 0.4129003882408142, + "learning_rate": 8.416523235800345e-05, + "loss": 1.8249, + "step": 3138 + }, + { + "epoch": 0.9528001214144787, + "grad_norm": 0.501766562461853, + "learning_rate": 8.416017009213324e-05, + "loss": 1.5152, + "step": 3139 + }, + { + "epoch": 0.9531036576111701, + "grad_norm": 0.36676540970802307, + "learning_rate": 8.415510782626304e-05, + "loss": 1.714, + "step": 3140 + }, + { + "epoch": 0.9534071938078615, + "grad_norm": 0.3785346746444702, + "learning_rate": 8.415004556039283e-05, + "loss": 1.8275, + "step": 3141 + }, + { + "epoch": 0.9537107300045531, + "grad_norm": 0.495661199092865, + "learning_rate": 8.414498329452263e-05, + "loss": 2.2824, + "step": 3142 + }, + { + "epoch": 0.9540142662012445, + "grad_norm": 0.361187607049942, + "learning_rate": 8.413992102865242e-05, + "loss": 2.1639, + "step": 3143 + }, + { + "epoch": 0.954317802397936, + "grad_norm": 0.4101172983646393, + "learning_rate": 8.413485876278222e-05, + "loss": 1.8902, + "step": 3144 + }, + { + "epoch": 0.9546213385946274, + "grad_norm": 0.4362664222717285, + "learning_rate": 8.412979649691201e-05, + "loss": 1.5794, + "step": 3145 + }, + { + "epoch": 0.9549248747913188, + "grad_norm": 0.3917141258716583, + "learning_rate": 8.41247342310418e-05, + "loss": 1.9078, + "step": 3146 + }, + { + "epoch": 0.9552284109880104, + "grad_norm": 0.4394182562828064, + "learning_rate": 8.411967196517161e-05, + "loss": 1.7582, + "step": 3147 + }, + { + "epoch": 0.9555319471847018, + "grad_norm": 0.3726690709590912, + "learning_rate": 8.411460969930142e-05, + "loss": 2.0366, + "step": 3148 + }, + { + "epoch": 0.9558354833813932, + "grad_norm": 0.40689945220947266, + "learning_rate": 8.410954743343122e-05, + "loss": 1.9526, + "step": 3149 + }, + { + "epoch": 0.9561390195780847, + "grad_norm": 0.3834896385669708, + "learning_rate": 8.410448516756101e-05, + "loss": 1.8866, + "step": 3150 + }, + { + "epoch": 0.9564425557747761, + "grad_norm": 0.3632187247276306, + "learning_rate": 8.409942290169081e-05, + "loss": 1.8138, + "step": 3151 + }, + { + "epoch": 0.9567460919714676, + "grad_norm": 0.42645806074142456, + "learning_rate": 8.40943606358206e-05, + "loss": 1.8987, + "step": 3152 + }, + { + "epoch": 0.9570496281681591, + "grad_norm": 0.3693891763687134, + "learning_rate": 8.40892983699504e-05, + "loss": 1.8998, + "step": 3153 + }, + { + "epoch": 0.9573531643648505, + "grad_norm": 0.35439208149909973, + "learning_rate": 8.408423610408019e-05, + "loss": 1.5383, + "step": 3154 + }, + { + "epoch": 0.957656700561542, + "grad_norm": 0.4941239356994629, + "learning_rate": 8.407917383820999e-05, + "loss": 1.6191, + "step": 3155 + }, + { + "epoch": 0.9579602367582334, + "grad_norm": 0.4424782693386078, + "learning_rate": 8.407411157233978e-05, + "loss": 1.8852, + "step": 3156 + }, + { + "epoch": 0.9582637729549248, + "grad_norm": 0.42269936203956604, + "learning_rate": 8.406904930646958e-05, + "loss": 1.9435, + "step": 3157 + }, + { + "epoch": 0.9585673091516164, + "grad_norm": 0.48583322763442993, + "learning_rate": 8.406398704059938e-05, + "loss": 2.0429, + "step": 3158 + }, + { + "epoch": 0.9588708453483078, + "grad_norm": 0.3931976854801178, + "learning_rate": 8.405892477472918e-05, + "loss": 1.8182, + "step": 3159 + }, + { + "epoch": 0.9591743815449992, + "grad_norm": 0.4035438001155853, + "learning_rate": 8.405386250885897e-05, + "loss": 2.0072, + "step": 3160 + }, + { + "epoch": 0.9594779177416907, + "grad_norm": 0.4093266725540161, + "learning_rate": 8.404880024298877e-05, + "loss": 1.7673, + "step": 3161 + }, + { + "epoch": 0.9597814539383821, + "grad_norm": 0.455600380897522, + "learning_rate": 8.404373797711856e-05, + "loss": 2.065, + "step": 3162 + }, + { + "epoch": 0.9600849901350736, + "grad_norm": 0.3882180154323578, + "learning_rate": 8.403867571124836e-05, + "loss": 2.3081, + "step": 3163 + }, + { + "epoch": 0.9603885263317651, + "grad_norm": 0.4267047941684723, + "learning_rate": 8.403361344537815e-05, + "loss": 1.1691, + "step": 3164 + }, + { + "epoch": 0.9606920625284565, + "grad_norm": 0.3406231701374054, + "learning_rate": 8.402855117950795e-05, + "loss": 1.1152, + "step": 3165 + }, + { + "epoch": 0.960995598725148, + "grad_norm": 0.34307190775871277, + "learning_rate": 8.402348891363774e-05, + "loss": 1.4004, + "step": 3166 + }, + { + "epoch": 0.9612991349218394, + "grad_norm": 0.42091912031173706, + "learning_rate": 8.401842664776755e-05, + "loss": 1.736, + "step": 3167 + }, + { + "epoch": 0.9616026711185309, + "grad_norm": 0.7770476341247559, + "learning_rate": 8.401336438189735e-05, + "loss": 1.7832, + "step": 3168 + }, + { + "epoch": 0.9619062073152224, + "grad_norm": 0.428165078163147, + "learning_rate": 8.400830211602714e-05, + "loss": 1.1608, + "step": 3169 + }, + { + "epoch": 0.9622097435119138, + "grad_norm": 1.1989792585372925, + "learning_rate": 8.400323985015694e-05, + "loss": 1.8082, + "step": 3170 + }, + { + "epoch": 0.9625132797086052, + "grad_norm": 0.41019129753112793, + "learning_rate": 8.399817758428673e-05, + "loss": 1.9886, + "step": 3171 + }, + { + "epoch": 0.9628168159052967, + "grad_norm": 0.394325315952301, + "learning_rate": 8.399311531841653e-05, + "loss": 2.0017, + "step": 3172 + }, + { + "epoch": 0.9631203521019882, + "grad_norm": 0.42622506618499756, + "learning_rate": 8.398805305254632e-05, + "loss": 1.9955, + "step": 3173 + }, + { + "epoch": 0.9634238882986796, + "grad_norm": 0.6596471071243286, + "learning_rate": 8.398299078667612e-05, + "loss": 1.7009, + "step": 3174 + }, + { + "epoch": 0.9637274244953711, + "grad_norm": 0.4476582407951355, + "learning_rate": 8.397792852080591e-05, + "loss": 1.514, + "step": 3175 + }, + { + "epoch": 0.9640309606920625, + "grad_norm": 0.4172927737236023, + "learning_rate": 8.397286625493572e-05, + "loss": 1.8323, + "step": 3176 + }, + { + "epoch": 0.964334496888754, + "grad_norm": 0.46736040711402893, + "learning_rate": 8.396780398906551e-05, + "loss": 1.6481, + "step": 3177 + }, + { + "epoch": 0.9646380330854455, + "grad_norm": 0.39226916432380676, + "learning_rate": 8.396274172319531e-05, + "loss": 2.1195, + "step": 3178 + }, + { + "epoch": 0.9649415692821369, + "grad_norm": 0.44442611932754517, + "learning_rate": 8.39576794573251e-05, + "loss": 1.9057, + "step": 3179 + }, + { + "epoch": 0.9652451054788284, + "grad_norm": 0.46118879318237305, + "learning_rate": 8.39526171914549e-05, + "loss": 1.7676, + "step": 3180 + }, + { + "epoch": 0.9655486416755198, + "grad_norm": 0.38712286949157715, + "learning_rate": 8.394755492558469e-05, + "loss": 1.9661, + "step": 3181 + }, + { + "epoch": 0.9658521778722112, + "grad_norm": 0.3752710223197937, + "learning_rate": 8.394249265971449e-05, + "loss": 2.014, + "step": 3182 + }, + { + "epoch": 0.9661557140689028, + "grad_norm": 0.4489542245864868, + "learning_rate": 8.393743039384428e-05, + "loss": 1.374, + "step": 3183 + }, + { + "epoch": 0.9664592502655942, + "grad_norm": 0.7780880928039551, + "learning_rate": 8.393236812797408e-05, + "loss": 1.7041, + "step": 3184 + }, + { + "epoch": 0.9667627864622856, + "grad_norm": 0.3980183005332947, + "learning_rate": 8.392730586210387e-05, + "loss": 2.193, + "step": 3185 + }, + { + "epoch": 0.9670663226589771, + "grad_norm": 0.6425886750221252, + "learning_rate": 8.392224359623368e-05, + "loss": 2.0175, + "step": 3186 + }, + { + "epoch": 0.9673698588556685, + "grad_norm": 0.5367327928543091, + "learning_rate": 8.391718133036347e-05, + "loss": 1.6297, + "step": 3187 + }, + { + "epoch": 0.9676733950523599, + "grad_norm": 0.4846515357494354, + "learning_rate": 8.391211906449327e-05, + "loss": 2.1914, + "step": 3188 + }, + { + "epoch": 0.9679769312490515, + "grad_norm": 0.4136618971824646, + "learning_rate": 8.390705679862306e-05, + "loss": 1.7868, + "step": 3189 + }, + { + "epoch": 0.9682804674457429, + "grad_norm": 0.49537962675094604, + "learning_rate": 8.390199453275286e-05, + "loss": 1.5581, + "step": 3190 + }, + { + "epoch": 0.9685840036424344, + "grad_norm": 0.40194493532180786, + "learning_rate": 8.389693226688265e-05, + "loss": 1.8225, + "step": 3191 + }, + { + "epoch": 0.9688875398391258, + "grad_norm": 0.4222577214241028, + "learning_rate": 8.389187000101246e-05, + "loss": 2.1477, + "step": 3192 + }, + { + "epoch": 0.9691910760358172, + "grad_norm": 0.44375449419021606, + "learning_rate": 8.388680773514226e-05, + "loss": 1.8249, + "step": 3193 + }, + { + "epoch": 0.9694946122325088, + "grad_norm": 0.47520217299461365, + "learning_rate": 8.388174546927205e-05, + "loss": 1.6632, + "step": 3194 + }, + { + "epoch": 0.9697981484292002, + "grad_norm": 0.4534616470336914, + "learning_rate": 8.387668320340185e-05, + "loss": 1.6831, + "step": 3195 + }, + { + "epoch": 0.9701016846258916, + "grad_norm": 0.38130199909210205, + "learning_rate": 8.387162093753164e-05, + "loss": 1.4674, + "step": 3196 + }, + { + "epoch": 0.9704052208225831, + "grad_norm": 0.4485642611980438, + "learning_rate": 8.386655867166145e-05, + "loss": 1.6484, + "step": 3197 + }, + { + "epoch": 0.9707087570192745, + "grad_norm": 0.4354068636894226, + "learning_rate": 8.386149640579124e-05, + "loss": 2.1642, + "step": 3198 + }, + { + "epoch": 0.971012293215966, + "grad_norm": 0.635586678981781, + "learning_rate": 8.385643413992104e-05, + "loss": 2.143, + "step": 3199 + }, + { + "epoch": 0.9713158294126575, + "grad_norm": 0.411639004945755, + "learning_rate": 8.385137187405083e-05, + "loss": 1.6845, + "step": 3200 + }, + { + "epoch": 0.9716193656093489, + "grad_norm": 0.8328726887702942, + "learning_rate": 8.384630960818063e-05, + "loss": 1.9091, + "step": 3201 + }, + { + "epoch": 0.9719229018060404, + "grad_norm": 0.3916926383972168, + "learning_rate": 8.384124734231042e-05, + "loss": 1.921, + "step": 3202 + }, + { + "epoch": 0.9722264380027318, + "grad_norm": 0.5521138906478882, + "learning_rate": 8.383618507644022e-05, + "loss": 1.4916, + "step": 3203 + }, + { + "epoch": 0.9725299741994233, + "grad_norm": 0.4344598948955536, + "learning_rate": 8.383112281057001e-05, + "loss": 1.8373, + "step": 3204 + }, + { + "epoch": 0.9728335103961148, + "grad_norm": 0.5206196904182434, + "learning_rate": 8.382606054469981e-05, + "loss": 1.02, + "step": 3205 + }, + { + "epoch": 0.9731370465928062, + "grad_norm": 0.4448585510253906, + "learning_rate": 8.382099827882962e-05, + "loss": 1.9117, + "step": 3206 + }, + { + "epoch": 0.9734405827894976, + "grad_norm": 0.39624467492103577, + "learning_rate": 8.381593601295941e-05, + "loss": 1.8143, + "step": 3207 + }, + { + "epoch": 0.9737441189861891, + "grad_norm": 0.43617355823516846, + "learning_rate": 8.38108737470892e-05, + "loss": 2.0802, + "step": 3208 + }, + { + "epoch": 0.9740476551828806, + "grad_norm": 0.33979347348213196, + "learning_rate": 8.3805811481219e-05, + "loss": 1.801, + "step": 3209 + }, + { + "epoch": 0.974351191379572, + "grad_norm": 0.4414675831794739, + "learning_rate": 8.38007492153488e-05, + "loss": 1.7201, + "step": 3210 + }, + { + "epoch": 0.9746547275762635, + "grad_norm": 0.4153429865837097, + "learning_rate": 8.379568694947859e-05, + "loss": 1.9318, + "step": 3211 + }, + { + "epoch": 0.9749582637729549, + "grad_norm": 0.5069173574447632, + "learning_rate": 8.379062468360839e-05, + "loss": 1.6647, + "step": 3212 + }, + { + "epoch": 0.9752617999696463, + "grad_norm": 0.4137086868286133, + "learning_rate": 8.378556241773818e-05, + "loss": 1.7239, + "step": 3213 + }, + { + "epoch": 0.9755653361663378, + "grad_norm": 0.4533393383026123, + "learning_rate": 8.378050015186797e-05, + "loss": 1.5102, + "step": 3214 + }, + { + "epoch": 0.9758688723630293, + "grad_norm": 0.5293918251991272, + "learning_rate": 8.377543788599778e-05, + "loss": 1.7053, + "step": 3215 + }, + { + "epoch": 0.9761724085597208, + "grad_norm": 0.43977779150009155, + "learning_rate": 8.377037562012758e-05, + "loss": 1.7897, + "step": 3216 + }, + { + "epoch": 0.9764759447564122, + "grad_norm": 0.32830044627189636, + "learning_rate": 8.376531335425737e-05, + "loss": 1.4395, + "step": 3217 + }, + { + "epoch": 0.9767794809531036, + "grad_norm": 0.4043295085430145, + "learning_rate": 8.376025108838717e-05, + "loss": 1.8202, + "step": 3218 + }, + { + "epoch": 0.9770830171497951, + "grad_norm": 0.392102986574173, + "learning_rate": 8.375518882251696e-05, + "loss": 1.8039, + "step": 3219 + }, + { + "epoch": 0.9773865533464866, + "grad_norm": 0.3810761868953705, + "learning_rate": 8.375012655664676e-05, + "loss": 1.7554, + "step": 3220 + }, + { + "epoch": 0.977690089543178, + "grad_norm": 0.43835926055908203, + "learning_rate": 8.374506429077655e-05, + "loss": 1.3123, + "step": 3221 + }, + { + "epoch": 0.9779936257398695, + "grad_norm": 0.40104183554649353, + "learning_rate": 8.374000202490635e-05, + "loss": 1.9954, + "step": 3222 + }, + { + "epoch": 0.9782971619365609, + "grad_norm": 0.423921138048172, + "learning_rate": 8.373493975903614e-05, + "loss": 1.8166, + "step": 3223 + }, + { + "epoch": 0.9786006981332523, + "grad_norm": 0.3636658191680908, + "learning_rate": 8.372987749316594e-05, + "loss": 1.7822, + "step": 3224 + }, + { + "epoch": 0.9789042343299439, + "grad_norm": 0.5033218860626221, + "learning_rate": 8.372481522729574e-05, + "loss": 1.9422, + "step": 3225 + }, + { + "epoch": 0.9792077705266353, + "grad_norm": 0.43753013014793396, + "learning_rate": 8.371975296142554e-05, + "loss": 1.8121, + "step": 3226 + }, + { + "epoch": 0.9795113067233268, + "grad_norm": 0.40286412835121155, + "learning_rate": 8.371469069555533e-05, + "loss": 1.3284, + "step": 3227 + }, + { + "epoch": 0.9798148429200182, + "grad_norm": 0.4499688148498535, + "learning_rate": 8.370962842968513e-05, + "loss": 2.001, + "step": 3228 + }, + { + "epoch": 0.9801183791167096, + "grad_norm": 0.4191727042198181, + "learning_rate": 8.370456616381492e-05, + "loss": 1.822, + "step": 3229 + }, + { + "epoch": 0.9804219153134012, + "grad_norm": 0.5225554704666138, + "learning_rate": 8.369950389794472e-05, + "loss": 1.7851, + "step": 3230 + }, + { + "epoch": 0.9807254515100926, + "grad_norm": 0.48582664132118225, + "learning_rate": 8.369444163207451e-05, + "loss": 1.7945, + "step": 3231 + }, + { + "epoch": 0.981028987706784, + "grad_norm": 0.39768776297569275, + "learning_rate": 8.368937936620431e-05, + "loss": 1.4223, + "step": 3232 + }, + { + "epoch": 0.9813325239034755, + "grad_norm": 0.38653451204299927, + "learning_rate": 8.36843171003341e-05, + "loss": 1.913, + "step": 3233 + }, + { + "epoch": 0.9816360601001669, + "grad_norm": 0.42827606201171875, + "learning_rate": 8.367925483446391e-05, + "loss": 1.8324, + "step": 3234 + }, + { + "epoch": 0.9819395962968585, + "grad_norm": 0.5108979344367981, + "learning_rate": 8.36741925685937e-05, + "loss": 1.8669, + "step": 3235 + }, + { + "epoch": 0.9822431324935499, + "grad_norm": 0.49551811814308167, + "learning_rate": 8.366913030272351e-05, + "loss": 1.8174, + "step": 3236 + }, + { + "epoch": 0.9825466686902413, + "grad_norm": 0.3723476529121399, + "learning_rate": 8.366406803685331e-05, + "loss": 2.2496, + "step": 3237 + }, + { + "epoch": 0.9828502048869328, + "grad_norm": 0.4076946973800659, + "learning_rate": 8.36590057709831e-05, + "loss": 2.0584, + "step": 3238 + }, + { + "epoch": 0.9831537410836242, + "grad_norm": 0.3642696440219879, + "learning_rate": 8.36539435051129e-05, + "loss": 1.9793, + "step": 3239 + }, + { + "epoch": 0.9834572772803156, + "grad_norm": 0.3629249632358551, + "learning_rate": 8.36488812392427e-05, + "loss": 1.3264, + "step": 3240 + }, + { + "epoch": 0.9837608134770072, + "grad_norm": 0.3478296399116516, + "learning_rate": 8.364381897337249e-05, + "loss": 1.7251, + "step": 3241 + }, + { + "epoch": 0.9840643496736986, + "grad_norm": 0.3987084627151489, + "learning_rate": 8.363875670750228e-05, + "loss": 1.8442, + "step": 3242 + }, + { + "epoch": 0.98436788587039, + "grad_norm": 0.4380822777748108, + "learning_rate": 8.363369444163208e-05, + "loss": 1.9684, + "step": 3243 + }, + { + "epoch": 0.9846714220670815, + "grad_norm": 0.3827231824398041, + "learning_rate": 8.362863217576187e-05, + "loss": 1.3639, + "step": 3244 + }, + { + "epoch": 0.9849749582637729, + "grad_norm": 0.39212536811828613, + "learning_rate": 8.362356990989168e-05, + "loss": 1.8493, + "step": 3245 + }, + { + "epoch": 0.9852784944604644, + "grad_norm": 0.40932169556617737, + "learning_rate": 8.361850764402148e-05, + "loss": 1.9505, + "step": 3246 + }, + { + "epoch": 0.9855820306571559, + "grad_norm": 0.40743735432624817, + "learning_rate": 8.361344537815127e-05, + "loss": 1.6057, + "step": 3247 + }, + { + "epoch": 0.9858855668538473, + "grad_norm": 0.44995588064193726, + "learning_rate": 8.360838311228107e-05, + "loss": 2.0585, + "step": 3248 + }, + { + "epoch": 0.9861891030505388, + "grad_norm": 0.40385690331459045, + "learning_rate": 8.360332084641086e-05, + "loss": 1.9227, + "step": 3249 + }, + { + "epoch": 0.9864926392472302, + "grad_norm": 0.3181687891483307, + "learning_rate": 8.359825858054065e-05, + "loss": 1.8171, + "step": 3250 + }, + { + "epoch": 0.9867961754439217, + "grad_norm": 0.5163337588310242, + "learning_rate": 8.359319631467045e-05, + "loss": 2.0483, + "step": 3251 + }, + { + "epoch": 0.9870997116406132, + "grad_norm": 0.3739945888519287, + "learning_rate": 8.358813404880024e-05, + "loss": 1.8311, + "step": 3252 + }, + { + "epoch": 0.9874032478373046, + "grad_norm": 0.38173356652259827, + "learning_rate": 8.358307178293004e-05, + "loss": 2.2227, + "step": 3253 + }, + { + "epoch": 0.987706784033996, + "grad_norm": 0.5024820566177368, + "learning_rate": 8.357800951705985e-05, + "loss": 1.4709, + "step": 3254 + }, + { + "epoch": 0.9880103202306875, + "grad_norm": 0.467106431722641, + "learning_rate": 8.357294725118964e-05, + "loss": 1.9105, + "step": 3255 + }, + { + "epoch": 0.988313856427379, + "grad_norm": 0.34657251834869385, + "learning_rate": 8.356788498531944e-05, + "loss": 1.7206, + "step": 3256 + }, + { + "epoch": 0.9886173926240704, + "grad_norm": 0.44770774245262146, + "learning_rate": 8.356282271944923e-05, + "loss": 1.8675, + "step": 3257 + }, + { + "epoch": 0.9889209288207619, + "grad_norm": 0.45685702562332153, + "learning_rate": 8.355776045357903e-05, + "loss": 2.0606, + "step": 3258 + }, + { + "epoch": 0.9892244650174533, + "grad_norm": 0.464417964220047, + "learning_rate": 8.355269818770882e-05, + "loss": 1.7514, + "step": 3259 + }, + { + "epoch": 0.9895280012141447, + "grad_norm": 0.3830156624317169, + "learning_rate": 8.354763592183862e-05, + "loss": 1.9846, + "step": 3260 + }, + { + "epoch": 0.9898315374108363, + "grad_norm": 0.9585968255996704, + "learning_rate": 8.354257365596841e-05, + "loss": 1.5536, + "step": 3261 + }, + { + "epoch": 0.9901350736075277, + "grad_norm": 0.5059092044830322, + "learning_rate": 8.35375113900982e-05, + "loss": 1.6083, + "step": 3262 + }, + { + "epoch": 0.9904386098042192, + "grad_norm": 0.4165644943714142, + "learning_rate": 8.3532449124228e-05, + "loss": 1.8939, + "step": 3263 + }, + { + "epoch": 0.9907421460009106, + "grad_norm": 0.4935527443885803, + "learning_rate": 8.352738685835781e-05, + "loss": 1.4373, + "step": 3264 + }, + { + "epoch": 0.991045682197602, + "grad_norm": 0.37208595871925354, + "learning_rate": 8.35223245924876e-05, + "loss": 1.9555, + "step": 3265 + }, + { + "epoch": 0.9913492183942936, + "grad_norm": 0.4575968384742737, + "learning_rate": 8.35172623266174e-05, + "loss": 1.5079, + "step": 3266 + }, + { + "epoch": 0.991652754590985, + "grad_norm": 0.7772040963172913, + "learning_rate": 8.35122000607472e-05, + "loss": 1.5094, + "step": 3267 + }, + { + "epoch": 0.9919562907876764, + "grad_norm": 0.3907686173915863, + "learning_rate": 8.350713779487699e-05, + "loss": 1.8795, + "step": 3268 + }, + { + "epoch": 0.9922598269843679, + "grad_norm": 0.40200579166412354, + "learning_rate": 8.350207552900678e-05, + "loss": 1.9728, + "step": 3269 + }, + { + "epoch": 0.9925633631810593, + "grad_norm": 0.8831415772438049, + "learning_rate": 8.349701326313658e-05, + "loss": 1.9289, + "step": 3270 + }, + { + "epoch": 0.9928668993777507, + "grad_norm": 0.4205533266067505, + "learning_rate": 8.349195099726637e-05, + "loss": 1.6273, + "step": 3271 + }, + { + "epoch": 0.9931704355744423, + "grad_norm": 0.3926026225090027, + "learning_rate": 8.348688873139617e-05, + "loss": 1.9261, + "step": 3272 + }, + { + "epoch": 0.9934739717711337, + "grad_norm": 0.4113319516181946, + "learning_rate": 8.348182646552598e-05, + "loss": 1.8568, + "step": 3273 + }, + { + "epoch": 0.9937775079678252, + "grad_norm": 0.4487648904323578, + "learning_rate": 8.347676419965577e-05, + "loss": 1.5191, + "step": 3274 + }, + { + "epoch": 0.9940810441645166, + "grad_norm": 0.5642136335372925, + "learning_rate": 8.347170193378557e-05, + "loss": 1.5365, + "step": 3275 + }, + { + "epoch": 0.994384580361208, + "grad_norm": 0.3658483624458313, + "learning_rate": 8.346663966791536e-05, + "loss": 2.0491, + "step": 3276 + }, + { + "epoch": 0.9946881165578996, + "grad_norm": 0.397892564535141, + "learning_rate": 8.346157740204516e-05, + "loss": 2.0732, + "step": 3277 + }, + { + "epoch": 0.994991652754591, + "grad_norm": 0.39073920249938965, + "learning_rate": 8.345651513617495e-05, + "loss": 1.4625, + "step": 3278 + }, + { + "epoch": 0.9952951889512824, + "grad_norm": 0.46809303760528564, + "learning_rate": 8.345145287030474e-05, + "loss": 1.6965, + "step": 3279 + }, + { + "epoch": 0.9955987251479739, + "grad_norm": 0.5772141814231873, + "learning_rate": 8.344639060443454e-05, + "loss": 2.0566, + "step": 3280 + }, + { + "epoch": 0.9959022613446653, + "grad_norm": 0.43104979395866394, + "learning_rate": 8.344132833856435e-05, + "loss": 1.8229, + "step": 3281 + }, + { + "epoch": 0.9962057975413569, + "grad_norm": 0.4449026882648468, + "learning_rate": 8.343626607269414e-05, + "loss": 2.1477, + "step": 3282 + }, + { + "epoch": 0.9965093337380483, + "grad_norm": 0.5293501019477844, + "learning_rate": 8.343120380682394e-05, + "loss": 1.6738, + "step": 3283 + }, + { + "epoch": 0.9968128699347397, + "grad_norm": 0.33359965682029724, + "learning_rate": 8.342614154095375e-05, + "loss": 1.0495, + "step": 3284 + }, + { + "epoch": 0.9971164061314312, + "grad_norm": 0.3978114128112793, + "learning_rate": 8.342107927508354e-05, + "loss": 1.9208, + "step": 3285 + }, + { + "epoch": 0.9974199423281226, + "grad_norm": 0.8589026927947998, + "learning_rate": 8.341601700921334e-05, + "loss": 2.0542, + "step": 3286 + }, + { + "epoch": 0.9977234785248141, + "grad_norm": 0.44943809509277344, + "learning_rate": 8.341095474334313e-05, + "loss": 1.5192, + "step": 3287 + }, + { + "epoch": 0.9980270147215056, + "grad_norm": 0.4497203826904297, + "learning_rate": 8.340589247747292e-05, + "loss": 1.9164, + "step": 3288 + }, + { + "epoch": 0.998330550918197, + "grad_norm": 0.4665476977825165, + "learning_rate": 8.340083021160272e-05, + "loss": 1.7622, + "step": 3289 + }, + { + "epoch": 0.9986340871148884, + "grad_norm": 0.43909046053886414, + "learning_rate": 8.339576794573251e-05, + "loss": 1.9663, + "step": 3290 + }, + { + "epoch": 0.9989376233115799, + "grad_norm": 0.401471883058548, + "learning_rate": 8.339070567986231e-05, + "loss": 1.9976, + "step": 3291 + }, + { + "epoch": 0.9992411595082714, + "grad_norm": 0.41323089599609375, + "learning_rate": 8.33856434139921e-05, + "loss": 1.9335, + "step": 3292 + }, + { + "epoch": 0.9995446957049628, + "grad_norm": 0.5027084350585938, + "learning_rate": 8.33805811481219e-05, + "loss": 1.9357, + "step": 3293 + }, + { + "epoch": 0.9998482319016543, + "grad_norm": 0.4065913259983063, + "learning_rate": 8.337551888225171e-05, + "loss": 1.7295, + "step": 3294 + }, + { + "epoch": 1.0001517680983458, + "grad_norm": 0.9244177341461182, + "learning_rate": 8.33704566163815e-05, + "loss": 1.8995, + "step": 3295 + }, + { + "epoch": 1.0004553042950373, + "grad_norm": 0.4463343024253845, + "learning_rate": 8.33653943505113e-05, + "loss": 1.4706, + "step": 3296 + }, + { + "epoch": 1.0007588404917287, + "grad_norm": 0.38846856355667114, + "learning_rate": 8.336033208464109e-05, + "loss": 1.5425, + "step": 3297 + }, + { + "epoch": 1.0010623766884201, + "grad_norm": 0.4786315858364105, + "learning_rate": 8.335526981877089e-05, + "loss": 1.4659, + "step": 3298 + }, + { + "epoch": 1.0013659128851116, + "grad_norm": 0.42400819063186646, + "learning_rate": 8.335020755290068e-05, + "loss": 1.8604, + "step": 3299 + }, + { + "epoch": 1.001669449081803, + "grad_norm": 0.5442892909049988, + "learning_rate": 8.334514528703048e-05, + "loss": 1.1405, + "step": 3300 + }, + { + "epoch": 1.0019729852784944, + "grad_norm": 0.43166840076446533, + "learning_rate": 8.334008302116027e-05, + "loss": 1.6397, + "step": 3301 + }, + { + "epoch": 1.0022765214751859, + "grad_norm": 0.5023279786109924, + "learning_rate": 8.333502075529007e-05, + "loss": 1.8333, + "step": 3302 + }, + { + "epoch": 1.0025800576718773, + "grad_norm": 0.9390707015991211, + "learning_rate": 8.332995848941987e-05, + "loss": 1.0042, + "step": 3303 + }, + { + "epoch": 1.0028835938685687, + "grad_norm": 0.4091005027294159, + "learning_rate": 8.332489622354967e-05, + "loss": 1.6559, + "step": 3304 + }, + { + "epoch": 1.0031871300652604, + "grad_norm": 1.1399965286254883, + "learning_rate": 8.331983395767946e-05, + "loss": 1.2274, + "step": 3305 + }, + { + "epoch": 1.0034906662619518, + "grad_norm": 0.4626876711845398, + "learning_rate": 8.331477169180926e-05, + "loss": 1.7368, + "step": 3306 + }, + { + "epoch": 1.0037942024586433, + "grad_norm": 0.6136215329170227, + "learning_rate": 8.330970942593905e-05, + "loss": 1.442, + "step": 3307 + }, + { + "epoch": 1.0040977386553347, + "grad_norm": 0.7071730494499207, + "learning_rate": 8.330464716006885e-05, + "loss": 1.2879, + "step": 3308 + }, + { + "epoch": 1.0044012748520261, + "grad_norm": 0.7924762964248657, + "learning_rate": 8.329958489419864e-05, + "loss": 0.8643, + "step": 3309 + }, + { + "epoch": 1.0047048110487176, + "grad_norm": 0.42483755946159363, + "learning_rate": 8.329452262832844e-05, + "loss": 1.2481, + "step": 3310 + }, + { + "epoch": 1.005008347245409, + "grad_norm": 0.472868412733078, + "learning_rate": 8.328946036245823e-05, + "loss": 1.5925, + "step": 3311 + }, + { + "epoch": 1.0053118834421004, + "grad_norm": 0.43884944915771484, + "learning_rate": 8.328439809658804e-05, + "loss": 1.6212, + "step": 3312 + }, + { + "epoch": 1.0056154196387919, + "grad_norm": 0.8129292130470276, + "learning_rate": 8.327933583071784e-05, + "loss": 1.1515, + "step": 3313 + }, + { + "epoch": 1.0059189558354833, + "grad_norm": 0.5763627886772156, + "learning_rate": 8.327427356484763e-05, + "loss": 1.8529, + "step": 3314 + }, + { + "epoch": 1.0062224920321747, + "grad_norm": 0.7095117568969727, + "learning_rate": 8.326921129897743e-05, + "loss": 1.5395, + "step": 3315 + }, + { + "epoch": 1.0065260282288664, + "grad_norm": 0.4236385226249695, + "learning_rate": 8.326414903310722e-05, + "loss": 1.8515, + "step": 3316 + }, + { + "epoch": 1.0068295644255578, + "grad_norm": 0.49490535259246826, + "learning_rate": 8.325908676723701e-05, + "loss": 1.3917, + "step": 3317 + }, + { + "epoch": 1.0071331006222493, + "grad_norm": 0.4824042022228241, + "learning_rate": 8.325402450136681e-05, + "loss": 1.4685, + "step": 3318 + }, + { + "epoch": 1.0074366368189407, + "grad_norm": 0.3797103464603424, + "learning_rate": 8.32489622354966e-05, + "loss": 1.2106, + "step": 3319 + }, + { + "epoch": 1.0077401730156321, + "grad_norm": 0.3965649902820587, + "learning_rate": 8.32438999696264e-05, + "loss": 1.9664, + "step": 3320 + }, + { + "epoch": 1.0080437092123236, + "grad_norm": 0.45846912264823914, + "learning_rate": 8.323883770375621e-05, + "loss": 1.6475, + "step": 3321 + }, + { + "epoch": 1.008347245409015, + "grad_norm": 0.4603506326675415, + "learning_rate": 8.3233775437886e-05, + "loss": 1.6987, + "step": 3322 + }, + { + "epoch": 1.0086507816057064, + "grad_norm": 0.4500599801540375, + "learning_rate": 8.32287131720158e-05, + "loss": 1.5708, + "step": 3323 + }, + { + "epoch": 1.0089543178023979, + "grad_norm": 0.3444702923297882, + "learning_rate": 8.322365090614559e-05, + "loss": 1.0653, + "step": 3324 + }, + { + "epoch": 1.0092578539990893, + "grad_norm": 0.6663349270820618, + "learning_rate": 8.32185886402754e-05, + "loss": 1.6385, + "step": 3325 + }, + { + "epoch": 1.009561390195781, + "grad_norm": 0.41944435238838196, + "learning_rate": 8.32135263744052e-05, + "loss": 1.6023, + "step": 3326 + }, + { + "epoch": 1.0098649263924724, + "grad_norm": 0.45441389083862305, + "learning_rate": 8.320846410853499e-05, + "loss": 1.4627, + "step": 3327 + }, + { + "epoch": 1.0101684625891638, + "grad_norm": 0.4097443222999573, + "learning_rate": 8.320340184266478e-05, + "loss": 1.7927, + "step": 3328 + }, + { + "epoch": 1.0104719987858553, + "grad_norm": 0.4613991677761078, + "learning_rate": 8.319833957679458e-05, + "loss": 1.8251, + "step": 3329 + }, + { + "epoch": 1.0107755349825467, + "grad_norm": 0.43065598607063293, + "learning_rate": 8.319327731092437e-05, + "loss": 1.5799, + "step": 3330 + }, + { + "epoch": 1.0110790711792381, + "grad_norm": 0.44765231013298035, + "learning_rate": 8.318821504505417e-05, + "loss": 1.2663, + "step": 3331 + }, + { + "epoch": 1.0113826073759296, + "grad_norm": 0.4792670011520386, + "learning_rate": 8.318315277918396e-05, + "loss": 1.6675, + "step": 3332 + }, + { + "epoch": 1.011686143572621, + "grad_norm": 0.46904540061950684, + "learning_rate": 8.317809051331377e-05, + "loss": 1.6204, + "step": 3333 + }, + { + "epoch": 1.0119896797693124, + "grad_norm": 0.43783873319625854, + "learning_rate": 8.317302824744357e-05, + "loss": 1.8069, + "step": 3334 + }, + { + "epoch": 1.0122932159660039, + "grad_norm": 0.4738599359989166, + "learning_rate": 8.316796598157336e-05, + "loss": 1.8303, + "step": 3335 + }, + { + "epoch": 1.0125967521626955, + "grad_norm": 0.46483105421066284, + "learning_rate": 8.316290371570316e-05, + "loss": 1.3623, + "step": 3336 + }, + { + "epoch": 1.012900288359387, + "grad_norm": 0.3990911543369293, + "learning_rate": 8.315784144983295e-05, + "loss": 1.4985, + "step": 3337 + }, + { + "epoch": 1.0132038245560784, + "grad_norm": 0.887096107006073, + "learning_rate": 8.315277918396275e-05, + "loss": 1.422, + "step": 3338 + }, + { + "epoch": 1.0135073607527698, + "grad_norm": 0.4891083836555481, + "learning_rate": 8.314771691809254e-05, + "loss": 1.7041, + "step": 3339 + }, + { + "epoch": 1.0138108969494612, + "grad_norm": 0.5917662382125854, + "learning_rate": 8.314265465222234e-05, + "loss": 1.519, + "step": 3340 + }, + { + "epoch": 1.0141144331461527, + "grad_norm": 0.4926755726337433, + "learning_rate": 8.313759238635213e-05, + "loss": 1.4773, + "step": 3341 + }, + { + "epoch": 1.0144179693428441, + "grad_norm": 0.42069876194000244, + "learning_rate": 8.313253012048194e-05, + "loss": 1.39, + "step": 3342 + }, + { + "epoch": 1.0147215055395356, + "grad_norm": 0.5121155977249146, + "learning_rate": 8.312746785461173e-05, + "loss": 1.4662, + "step": 3343 + }, + { + "epoch": 1.015025041736227, + "grad_norm": 0.4447721838951111, + "learning_rate": 8.312240558874153e-05, + "loss": 1.5363, + "step": 3344 + }, + { + "epoch": 1.0153285779329184, + "grad_norm": 0.5080471634864807, + "learning_rate": 8.311734332287132e-05, + "loss": 1.6594, + "step": 3345 + }, + { + "epoch": 1.0156321141296099, + "grad_norm": 0.6592669486999512, + "learning_rate": 8.311228105700112e-05, + "loss": 1.9014, + "step": 3346 + }, + { + "epoch": 1.0159356503263015, + "grad_norm": 0.48114287853240967, + "learning_rate": 8.310721879113091e-05, + "loss": 1.4714, + "step": 3347 + }, + { + "epoch": 1.016239186522993, + "grad_norm": 0.4233185350894928, + "learning_rate": 8.310215652526071e-05, + "loss": 1.5716, + "step": 3348 + }, + { + "epoch": 1.0165427227196844, + "grad_norm": 0.41045501828193665, + "learning_rate": 8.30970942593905e-05, + "loss": 1.6364, + "step": 3349 + }, + { + "epoch": 1.0168462589163758, + "grad_norm": 0.48139652609825134, + "learning_rate": 8.30920319935203e-05, + "loss": 1.3519, + "step": 3350 + }, + { + "epoch": 1.0171497951130672, + "grad_norm": 0.908079981803894, + "learning_rate": 8.30869697276501e-05, + "loss": 1.5322, + "step": 3351 + }, + { + "epoch": 1.0174533313097587, + "grad_norm": 0.6833621859550476, + "learning_rate": 8.30819074617799e-05, + "loss": 1.8607, + "step": 3352 + }, + { + "epoch": 1.0177568675064501, + "grad_norm": 0.40126875042915344, + "learning_rate": 8.30768451959097e-05, + "loss": 1.7857, + "step": 3353 + }, + { + "epoch": 1.0180604037031415, + "grad_norm": 0.5128709077835083, + "learning_rate": 8.307178293003949e-05, + "loss": 1.5926, + "step": 3354 + }, + { + "epoch": 1.018363939899833, + "grad_norm": 0.37055882811546326, + "learning_rate": 8.306672066416928e-05, + "loss": 1.3967, + "step": 3355 + }, + { + "epoch": 1.0186674760965244, + "grad_norm": 0.4289558529853821, + "learning_rate": 8.306165839829908e-05, + "loss": 1.5605, + "step": 3356 + }, + { + "epoch": 1.018971012293216, + "grad_norm": 0.42714962363243103, + "learning_rate": 8.305659613242887e-05, + "loss": 1.4025, + "step": 3357 + }, + { + "epoch": 1.0192745484899075, + "grad_norm": 0.5124452114105225, + "learning_rate": 8.305153386655867e-05, + "loss": 1.6768, + "step": 3358 + }, + { + "epoch": 1.019578084686599, + "grad_norm": 0.5419396758079529, + "learning_rate": 8.304647160068846e-05, + "loss": 1.918, + "step": 3359 + }, + { + "epoch": 1.0198816208832904, + "grad_norm": 0.4802889823913574, + "learning_rate": 8.304140933481827e-05, + "loss": 1.6741, + "step": 3360 + }, + { + "epoch": 1.0201851570799818, + "grad_norm": 0.4368225038051605, + "learning_rate": 8.303634706894807e-05, + "loss": 1.7055, + "step": 3361 + }, + { + "epoch": 1.0204886932766732, + "grad_norm": 0.42357340455055237, + "learning_rate": 8.303128480307786e-05, + "loss": 1.5149, + "step": 3362 + }, + { + "epoch": 1.0207922294733647, + "grad_norm": 0.49789273738861084, + "learning_rate": 8.302622253720766e-05, + "loss": 1.0468, + "step": 3363 + }, + { + "epoch": 1.021095765670056, + "grad_norm": 0.4446631073951721, + "learning_rate": 8.302116027133745e-05, + "loss": 1.9386, + "step": 3364 + }, + { + "epoch": 1.0213993018667475, + "grad_norm": 0.47823619842529297, + "learning_rate": 8.301609800546725e-05, + "loss": 1.6925, + "step": 3365 + }, + { + "epoch": 1.021702838063439, + "grad_norm": 0.44988545775413513, + "learning_rate": 8.301103573959704e-05, + "loss": 1.7536, + "step": 3366 + }, + { + "epoch": 1.0220063742601304, + "grad_norm": 0.5718437433242798, + "learning_rate": 8.300597347372684e-05, + "loss": 1.9711, + "step": 3367 + }, + { + "epoch": 1.022309910456822, + "grad_norm": 0.6239888072013855, + "learning_rate": 8.300091120785663e-05, + "loss": 1.2131, + "step": 3368 + }, + { + "epoch": 1.0226134466535135, + "grad_norm": 0.3814062774181366, + "learning_rate": 8.299584894198643e-05, + "loss": 1.7703, + "step": 3369 + }, + { + "epoch": 1.022916982850205, + "grad_norm": 0.5032787322998047, + "learning_rate": 8.299078667611623e-05, + "loss": 1.5713, + "step": 3370 + }, + { + "epoch": 1.0232205190468964, + "grad_norm": 0.4844599664211273, + "learning_rate": 8.298572441024603e-05, + "loss": 2.0, + "step": 3371 + }, + { + "epoch": 1.0235240552435878, + "grad_norm": 0.4193565249443054, + "learning_rate": 8.298066214437584e-05, + "loss": 1.7529, + "step": 3372 + }, + { + "epoch": 1.0238275914402792, + "grad_norm": 0.46902111172676086, + "learning_rate": 8.297559987850563e-05, + "loss": 1.6534, + "step": 3373 + }, + { + "epoch": 1.0241311276369707, + "grad_norm": 0.3767467141151428, + "learning_rate": 8.297053761263543e-05, + "loss": 1.2065, + "step": 3374 + }, + { + "epoch": 1.024434663833662, + "grad_norm": 0.4703841209411621, + "learning_rate": 8.296547534676522e-05, + "loss": 1.3887, + "step": 3375 + }, + { + "epoch": 1.0247382000303535, + "grad_norm": 0.4988289177417755, + "learning_rate": 8.296041308089502e-05, + "loss": 1.7307, + "step": 3376 + }, + { + "epoch": 1.025041736227045, + "grad_norm": 0.5288352370262146, + "learning_rate": 8.295535081502481e-05, + "loss": 1.1543, + "step": 3377 + }, + { + "epoch": 1.0253452724237366, + "grad_norm": 0.44995805621147156, + "learning_rate": 8.29502885491546e-05, + "loss": 1.7502, + "step": 3378 + }, + { + "epoch": 1.025648808620428, + "grad_norm": 0.5098589062690735, + "learning_rate": 8.29452262832844e-05, + "loss": 1.3828, + "step": 3379 + }, + { + "epoch": 1.0259523448171195, + "grad_norm": 0.44407930970191956, + "learning_rate": 8.29401640174142e-05, + "loss": 1.6914, + "step": 3380 + }, + { + "epoch": 1.026255881013811, + "grad_norm": 0.5067889094352722, + "learning_rate": 8.2935101751544e-05, + "loss": 1.5762, + "step": 3381 + }, + { + "epoch": 1.0265594172105024, + "grad_norm": 0.554493248462677, + "learning_rate": 8.29300394856738e-05, + "loss": 1.5306, + "step": 3382 + }, + { + "epoch": 1.0268629534071938, + "grad_norm": 0.4484996497631073, + "learning_rate": 8.292497721980359e-05, + "loss": 1.8593, + "step": 3383 + }, + { + "epoch": 1.0271664896038852, + "grad_norm": 0.44968825578689575, + "learning_rate": 8.291991495393339e-05, + "loss": 1.5937, + "step": 3384 + }, + { + "epoch": 1.0274700258005767, + "grad_norm": 0.5322727560997009, + "learning_rate": 8.291485268806318e-05, + "loss": 1.6408, + "step": 3385 + }, + { + "epoch": 1.027773561997268, + "grad_norm": 0.47786515951156616, + "learning_rate": 8.290979042219298e-05, + "loss": 1.7686, + "step": 3386 + }, + { + "epoch": 1.0280770981939595, + "grad_norm": 0.5193067193031311, + "learning_rate": 8.290472815632277e-05, + "loss": 1.7331, + "step": 3387 + }, + { + "epoch": 1.0283806343906512, + "grad_norm": 0.4540434181690216, + "learning_rate": 8.289966589045257e-05, + "loss": 1.9388, + "step": 3388 + }, + { + "epoch": 1.0286841705873426, + "grad_norm": 0.46607738733291626, + "learning_rate": 8.289460362458236e-05, + "loss": 1.3198, + "step": 3389 + }, + { + "epoch": 1.028987706784034, + "grad_norm": 0.45862218737602234, + "learning_rate": 8.288954135871217e-05, + "loss": 1.8702, + "step": 3390 + }, + { + "epoch": 1.0292912429807255, + "grad_norm": 0.427997887134552, + "learning_rate": 8.288447909284196e-05, + "loss": 1.6277, + "step": 3391 + }, + { + "epoch": 1.029594779177417, + "grad_norm": 0.5143392086029053, + "learning_rate": 8.287941682697176e-05, + "loss": 1.6736, + "step": 3392 + }, + { + "epoch": 1.0298983153741084, + "grad_norm": 0.42483991384506226, + "learning_rate": 8.287435456110155e-05, + "loss": 1.7603, + "step": 3393 + }, + { + "epoch": 1.0302018515707998, + "grad_norm": 0.5088010430335999, + "learning_rate": 8.286929229523135e-05, + "loss": 1.7789, + "step": 3394 + }, + { + "epoch": 1.0305053877674912, + "grad_norm": 0.4327448904514313, + "learning_rate": 8.286423002936114e-05, + "loss": 1.8521, + "step": 3395 + }, + { + "epoch": 1.0308089239641827, + "grad_norm": 0.5049906969070435, + "learning_rate": 8.285916776349094e-05, + "loss": 1.0102, + "step": 3396 + }, + { + "epoch": 1.031112460160874, + "grad_norm": 0.5136983394622803, + "learning_rate": 8.285410549762073e-05, + "loss": 1.7249, + "step": 3397 + }, + { + "epoch": 1.0314159963575655, + "grad_norm": 0.6552875638008118, + "learning_rate": 8.284904323175053e-05, + "loss": 1.9925, + "step": 3398 + }, + { + "epoch": 1.0317195325542572, + "grad_norm": 0.5211325883865356, + "learning_rate": 8.284398096588032e-05, + "loss": 1.4414, + "step": 3399 + }, + { + "epoch": 1.0320230687509486, + "grad_norm": 0.5152223110198975, + "learning_rate": 8.283891870001013e-05, + "loss": 1.6676, + "step": 3400 + }, + { + "epoch": 1.03232660494764, + "grad_norm": 0.5396007299423218, + "learning_rate": 8.283385643413993e-05, + "loss": 1.5183, + "step": 3401 + }, + { + "epoch": 1.0326301411443315, + "grad_norm": 0.40341633558273315, + "learning_rate": 8.282879416826972e-05, + "loss": 1.5223, + "step": 3402 + }, + { + "epoch": 1.032933677341023, + "grad_norm": 1.1678493022918701, + "learning_rate": 8.282373190239952e-05, + "loss": 1.5059, + "step": 3403 + }, + { + "epoch": 1.0332372135377144, + "grad_norm": 0.48273664712905884, + "learning_rate": 8.281866963652931e-05, + "loss": 1.7899, + "step": 3404 + }, + { + "epoch": 1.0335407497344058, + "grad_norm": 0.5220130085945129, + "learning_rate": 8.28136073706591e-05, + "loss": 1.7364, + "step": 3405 + }, + { + "epoch": 1.0338442859310972, + "grad_norm": 0.6939902901649475, + "learning_rate": 8.28085451047889e-05, + "loss": 1.4888, + "step": 3406 + }, + { + "epoch": 1.0341478221277887, + "grad_norm": 0.48666536808013916, + "learning_rate": 8.28034828389187e-05, + "loss": 1.7026, + "step": 3407 + }, + { + "epoch": 1.03445135832448, + "grad_norm": 0.44669362902641296, + "learning_rate": 8.279842057304849e-05, + "loss": 1.8834, + "step": 3408 + }, + { + "epoch": 1.0347548945211718, + "grad_norm": 0.49153947830200195, + "learning_rate": 8.27933583071783e-05, + "loss": 1.8004, + "step": 3409 + }, + { + "epoch": 1.0350584307178632, + "grad_norm": 0.512179970741272, + "learning_rate": 8.278829604130809e-05, + "loss": 1.456, + "step": 3410 + }, + { + "epoch": 1.0353619669145546, + "grad_norm": 0.4923495054244995, + "learning_rate": 8.278323377543789e-05, + "loss": 1.862, + "step": 3411 + }, + { + "epoch": 1.035665503111246, + "grad_norm": 0.4329081177711487, + "learning_rate": 8.277817150956768e-05, + "loss": 1.567, + "step": 3412 + }, + { + "epoch": 1.0359690393079375, + "grad_norm": 0.43733224272727966, + "learning_rate": 8.277310924369748e-05, + "loss": 1.1695, + "step": 3413 + }, + { + "epoch": 1.036272575504629, + "grad_norm": 0.5006493330001831, + "learning_rate": 8.276804697782729e-05, + "loss": 1.7505, + "step": 3414 + }, + { + "epoch": 1.0365761117013204, + "grad_norm": 0.5372819304466248, + "learning_rate": 8.276298471195708e-05, + "loss": 1.4789, + "step": 3415 + }, + { + "epoch": 1.0368796478980118, + "grad_norm": 0.5383666753768921, + "learning_rate": 8.275792244608688e-05, + "loss": 2.0097, + "step": 3416 + }, + { + "epoch": 1.0371831840947032, + "grad_norm": 0.4749530851840973, + "learning_rate": 8.275286018021667e-05, + "loss": 1.9549, + "step": 3417 + }, + { + "epoch": 1.0374867202913947, + "grad_norm": 0.4715706408023834, + "learning_rate": 8.274779791434647e-05, + "loss": 1.4, + "step": 3418 + }, + { + "epoch": 1.037790256488086, + "grad_norm": 0.5556216239929199, + "learning_rate": 8.274273564847626e-05, + "loss": 1.4848, + "step": 3419 + }, + { + "epoch": 1.0380937926847777, + "grad_norm": 0.46504640579223633, + "learning_rate": 8.273767338260607e-05, + "loss": 1.4732, + "step": 3420 + }, + { + "epoch": 1.0383973288814692, + "grad_norm": 0.5332744121551514, + "learning_rate": 8.273261111673586e-05, + "loss": 1.8816, + "step": 3421 + }, + { + "epoch": 1.0387008650781606, + "grad_norm": 0.5160506963729858, + "learning_rate": 8.272754885086566e-05, + "loss": 1.5866, + "step": 3422 + }, + { + "epoch": 1.039004401274852, + "grad_norm": 0.47710439562797546, + "learning_rate": 8.272248658499545e-05, + "loss": 1.8447, + "step": 3423 + }, + { + "epoch": 1.0393079374715435, + "grad_norm": 0.5172427892684937, + "learning_rate": 8.271742431912525e-05, + "loss": 1.7375, + "step": 3424 + }, + { + "epoch": 1.039611473668235, + "grad_norm": 0.47947341203689575, + "learning_rate": 8.271236205325504e-05, + "loss": 1.7389, + "step": 3425 + }, + { + "epoch": 1.0399150098649264, + "grad_norm": 0.49502432346343994, + "learning_rate": 8.270729978738484e-05, + "loss": 1.2841, + "step": 3426 + }, + { + "epoch": 1.0402185460616178, + "grad_norm": 0.3829929828643799, + "learning_rate": 8.270223752151463e-05, + "loss": 1.3311, + "step": 3427 + }, + { + "epoch": 1.0405220822583092, + "grad_norm": 0.4519781470298767, + "learning_rate": 8.269717525564443e-05, + "loss": 1.6033, + "step": 3428 + }, + { + "epoch": 1.0408256184550007, + "grad_norm": 0.5434120893478394, + "learning_rate": 8.269211298977423e-05, + "loss": 1.7751, + "step": 3429 + }, + { + "epoch": 1.0411291546516923, + "grad_norm": 0.47064414620399475, + "learning_rate": 8.268705072390403e-05, + "loss": 1.5697, + "step": 3430 + }, + { + "epoch": 1.0414326908483837, + "grad_norm": 0.4496542811393738, + "learning_rate": 8.268198845803382e-05, + "loss": 1.8201, + "step": 3431 + }, + { + "epoch": 1.0417362270450752, + "grad_norm": 0.5103307962417603, + "learning_rate": 8.267692619216362e-05, + "loss": 1.6933, + "step": 3432 + }, + { + "epoch": 1.0420397632417666, + "grad_norm": 0.46422064304351807, + "learning_rate": 8.267186392629341e-05, + "loss": 1.7127, + "step": 3433 + }, + { + "epoch": 1.042343299438458, + "grad_norm": 0.4085420072078705, + "learning_rate": 8.266680166042321e-05, + "loss": 1.9025, + "step": 3434 + }, + { + "epoch": 1.0426468356351495, + "grad_norm": 0.5440680980682373, + "learning_rate": 8.2661739394553e-05, + "loss": 1.7786, + "step": 3435 + }, + { + "epoch": 1.042950371831841, + "grad_norm": 0.4835658669471741, + "learning_rate": 8.26566771286828e-05, + "loss": 1.8355, + "step": 3436 + }, + { + "epoch": 1.0432539080285324, + "grad_norm": 0.45734959840774536, + "learning_rate": 8.26516148628126e-05, + "loss": 1.225, + "step": 3437 + }, + { + "epoch": 1.0435574442252238, + "grad_norm": 0.49322158098220825, + "learning_rate": 8.264655259694239e-05, + "loss": 1.7176, + "step": 3438 + }, + { + "epoch": 1.0438609804219152, + "grad_norm": 0.44236719608306885, + "learning_rate": 8.26414903310722e-05, + "loss": 1.8368, + "step": 3439 + }, + { + "epoch": 1.0441645166186069, + "grad_norm": 0.5147215127944946, + "learning_rate": 8.263642806520199e-05, + "loss": 1.7734, + "step": 3440 + }, + { + "epoch": 1.0444680528152983, + "grad_norm": 0.4684102535247803, + "learning_rate": 8.263136579933179e-05, + "loss": 1.6995, + "step": 3441 + }, + { + "epoch": 1.0447715890119897, + "grad_norm": 0.4718666076660156, + "learning_rate": 8.262630353346158e-05, + "loss": 1.8043, + "step": 3442 + }, + { + "epoch": 1.0450751252086812, + "grad_norm": 0.5067316293716431, + "learning_rate": 8.262124126759138e-05, + "loss": 1.5676, + "step": 3443 + }, + { + "epoch": 1.0453786614053726, + "grad_norm": 0.5173615217208862, + "learning_rate": 8.261617900172117e-05, + "loss": 1.5209, + "step": 3444 + }, + { + "epoch": 1.045682197602064, + "grad_norm": 0.505668044090271, + "learning_rate": 8.261111673585097e-05, + "loss": 1.7613, + "step": 3445 + }, + { + "epoch": 1.0459857337987555, + "grad_norm": 0.39871707558631897, + "learning_rate": 8.260605446998076e-05, + "loss": 1.2263, + "step": 3446 + }, + { + "epoch": 1.046289269995447, + "grad_norm": 0.46265268325805664, + "learning_rate": 8.260099220411055e-05, + "loss": 1.5503, + "step": 3447 + }, + { + "epoch": 1.0465928061921383, + "grad_norm": 0.4678381681442261, + "learning_rate": 8.259592993824036e-05, + "loss": 1.4896, + "step": 3448 + }, + { + "epoch": 1.0468963423888298, + "grad_norm": 0.4106181263923645, + "learning_rate": 8.259086767237016e-05, + "loss": 1.6483, + "step": 3449 + }, + { + "epoch": 1.0471998785855212, + "grad_norm": 0.5662809014320374, + "learning_rate": 8.258580540649995e-05, + "loss": 1.7384, + "step": 3450 + }, + { + "epoch": 1.0475034147822129, + "grad_norm": 0.5098707675933838, + "learning_rate": 8.258074314062975e-05, + "loss": 1.6576, + "step": 3451 + }, + { + "epoch": 1.0478069509789043, + "grad_norm": 0.5266808867454529, + "learning_rate": 8.257568087475954e-05, + "loss": 1.7297, + "step": 3452 + }, + { + "epoch": 1.0481104871755957, + "grad_norm": 0.4747079014778137, + "learning_rate": 8.257061860888934e-05, + "loss": 1.9322, + "step": 3453 + }, + { + "epoch": 1.0484140233722872, + "grad_norm": 0.5059680938720703, + "learning_rate": 8.256555634301913e-05, + "loss": 1.3994, + "step": 3454 + }, + { + "epoch": 1.0487175595689786, + "grad_norm": 0.42003318667411804, + "learning_rate": 8.256049407714893e-05, + "loss": 1.9084, + "step": 3455 + }, + { + "epoch": 1.04902109576567, + "grad_norm": 0.5502803325653076, + "learning_rate": 8.255543181127872e-05, + "loss": 1.7448, + "step": 3456 + }, + { + "epoch": 1.0493246319623615, + "grad_norm": 9.822343826293945, + "learning_rate": 8.255036954540853e-05, + "loss": 1.7656, + "step": 3457 + }, + { + "epoch": 1.049628168159053, + "grad_norm": 0.45939022302627563, + "learning_rate": 8.254530727953832e-05, + "loss": 1.5728, + "step": 3458 + }, + { + "epoch": 1.0499317043557443, + "grad_norm": 0.45364564657211304, + "learning_rate": 8.254024501366813e-05, + "loss": 1.6829, + "step": 3459 + }, + { + "epoch": 1.0502352405524358, + "grad_norm": 0.5563533902168274, + "learning_rate": 8.253518274779793e-05, + "loss": 1.4829, + "step": 3460 + }, + { + "epoch": 1.0505387767491274, + "grad_norm": 0.5119985342025757, + "learning_rate": 8.253012048192772e-05, + "loss": 1.4512, + "step": 3461 + }, + { + "epoch": 1.0508423129458189, + "grad_norm": 0.4572368562221527, + "learning_rate": 8.252505821605752e-05, + "loss": 1.0341, + "step": 3462 + }, + { + "epoch": 1.0511458491425103, + "grad_norm": 0.5076282024383545, + "learning_rate": 8.251999595018731e-05, + "loss": 1.6044, + "step": 3463 + }, + { + "epoch": 1.0514493853392017, + "grad_norm": 0.6191340088844299, + "learning_rate": 8.251493368431711e-05, + "loss": 1.1313, + "step": 3464 + }, + { + "epoch": 1.0517529215358932, + "grad_norm": 0.5099151730537415, + "learning_rate": 8.25098714184469e-05, + "loss": 1.8197, + "step": 3465 + }, + { + "epoch": 1.0520564577325846, + "grad_norm": 0.5666268467903137, + "learning_rate": 8.25048091525767e-05, + "loss": 1.5068, + "step": 3466 + }, + { + "epoch": 1.052359993929276, + "grad_norm": 0.6376385688781738, + "learning_rate": 8.249974688670649e-05, + "loss": 1.9853, + "step": 3467 + }, + { + "epoch": 1.0526635301259675, + "grad_norm": 0.47871848940849304, + "learning_rate": 8.24946846208363e-05, + "loss": 1.6678, + "step": 3468 + }, + { + "epoch": 1.052967066322659, + "grad_norm": 0.4443090856075287, + "learning_rate": 8.24896223549661e-05, + "loss": 1.7854, + "step": 3469 + }, + { + "epoch": 1.0532706025193503, + "grad_norm": 0.4864484667778015, + "learning_rate": 8.248456008909589e-05, + "loss": 1.7076, + "step": 3470 + }, + { + "epoch": 1.053574138716042, + "grad_norm": 0.5868439078330994, + "learning_rate": 8.247949782322568e-05, + "loss": 1.4256, + "step": 3471 + }, + { + "epoch": 1.0538776749127334, + "grad_norm": 0.47932010889053345, + "learning_rate": 8.247443555735548e-05, + "loss": 1.7303, + "step": 3472 + }, + { + "epoch": 1.0541812111094249, + "grad_norm": 0.44566673040390015, + "learning_rate": 8.246937329148527e-05, + "loss": 1.5097, + "step": 3473 + }, + { + "epoch": 1.0544847473061163, + "grad_norm": 0.43415167927742004, + "learning_rate": 8.246431102561507e-05, + "loss": 1.5726, + "step": 3474 + }, + { + "epoch": 1.0547882835028077, + "grad_norm": 0.5519189834594727, + "learning_rate": 8.245924875974486e-05, + "loss": 1.9481, + "step": 3475 + }, + { + "epoch": 1.0550918196994992, + "grad_norm": 0.5307170152664185, + "learning_rate": 8.245418649387466e-05, + "loss": 1.1548, + "step": 3476 + }, + { + "epoch": 1.0553953558961906, + "grad_norm": 0.46552446484565735, + "learning_rate": 8.244912422800445e-05, + "loss": 1.5465, + "step": 3477 + }, + { + "epoch": 1.055698892092882, + "grad_norm": 0.45253750681877136, + "learning_rate": 8.244406196213426e-05, + "loss": 1.4566, + "step": 3478 + }, + { + "epoch": 1.0560024282895735, + "grad_norm": 0.49360036849975586, + "learning_rate": 8.243899969626406e-05, + "loss": 1.7727, + "step": 3479 + }, + { + "epoch": 1.056305964486265, + "grad_norm": 0.5780231356620789, + "learning_rate": 8.243393743039385e-05, + "loss": 1.7624, + "step": 3480 + }, + { + "epoch": 1.0566095006829563, + "grad_norm": 0.5073447227478027, + "learning_rate": 8.242887516452365e-05, + "loss": 1.087, + "step": 3481 + }, + { + "epoch": 1.056913036879648, + "grad_norm": 0.48128485679626465, + "learning_rate": 8.242381289865344e-05, + "loss": 1.5136, + "step": 3482 + }, + { + "epoch": 1.0572165730763394, + "grad_norm": 0.4653654992580414, + "learning_rate": 8.241875063278324e-05, + "loss": 1.757, + "step": 3483 + }, + { + "epoch": 1.0575201092730309, + "grad_norm": 0.6654927730560303, + "learning_rate": 8.241368836691303e-05, + "loss": 1.3386, + "step": 3484 + }, + { + "epoch": 1.0578236454697223, + "grad_norm": 4.631044387817383, + "learning_rate": 8.240862610104282e-05, + "loss": 1.4554, + "step": 3485 + }, + { + "epoch": 1.0581271816664137, + "grad_norm": 0.39119642972946167, + "learning_rate": 8.240356383517262e-05, + "loss": 2.0105, + "step": 3486 + }, + { + "epoch": 1.0584307178631052, + "grad_norm": 0.4815531075000763, + "learning_rate": 8.239850156930243e-05, + "loss": 1.4448, + "step": 3487 + }, + { + "epoch": 1.0587342540597966, + "grad_norm": 0.6337845921516418, + "learning_rate": 8.239343930343222e-05, + "loss": 1.1668, + "step": 3488 + }, + { + "epoch": 1.059037790256488, + "grad_norm": 0.49312466382980347, + "learning_rate": 8.238837703756202e-05, + "loss": 1.0456, + "step": 3489 + }, + { + "epoch": 1.0593413264531795, + "grad_norm": 0.7493519186973572, + "learning_rate": 8.238331477169181e-05, + "loss": 1.1351, + "step": 3490 + }, + { + "epoch": 1.059644862649871, + "grad_norm": 0.46115273237228394, + "learning_rate": 8.237825250582161e-05, + "loss": 1.662, + "step": 3491 + }, + { + "epoch": 1.0599483988465626, + "grad_norm": 0.5314909815788269, + "learning_rate": 8.23731902399514e-05, + "loss": 1.93, + "step": 3492 + }, + { + "epoch": 1.060251935043254, + "grad_norm": 0.4482044279575348, + "learning_rate": 8.23681279740812e-05, + "loss": 1.5613, + "step": 3493 + }, + { + "epoch": 1.0605554712399454, + "grad_norm": 0.4885779321193695, + "learning_rate": 8.236306570821099e-05, + "loss": 0.8249, + "step": 3494 + }, + { + "epoch": 1.0608590074366369, + "grad_norm": 0.49470406770706177, + "learning_rate": 8.235800344234079e-05, + "loss": 1.2612, + "step": 3495 + }, + { + "epoch": 1.0611625436333283, + "grad_norm": 0.5256443023681641, + "learning_rate": 8.23529411764706e-05, + "loss": 1.726, + "step": 3496 + }, + { + "epoch": 1.0614660798300197, + "grad_norm": 0.45549553632736206, + "learning_rate": 8.234787891060039e-05, + "loss": 1.6308, + "step": 3497 + }, + { + "epoch": 1.0617696160267112, + "grad_norm": 0.45309826731681824, + "learning_rate": 8.234281664473018e-05, + "loss": 1.6844, + "step": 3498 + }, + { + "epoch": 1.0620731522234026, + "grad_norm": 3.0413029193878174, + "learning_rate": 8.233775437885998e-05, + "loss": 1.5278, + "step": 3499 + }, + { + "epoch": 1.062376688420094, + "grad_norm": 0.5270426869392395, + "learning_rate": 8.233269211298977e-05, + "loss": 1.9033, + "step": 3500 + }, + { + "epoch": 1.0626802246167855, + "grad_norm": 0.5388810634613037, + "learning_rate": 8.232762984711957e-05, + "loss": 1.4571, + "step": 3501 + }, + { + "epoch": 1.0629837608134771, + "grad_norm": 0.5315312743186951, + "learning_rate": 8.232256758124936e-05, + "loss": 1.7535, + "step": 3502 + }, + { + "epoch": 1.0632872970101686, + "grad_norm": 0.530817985534668, + "learning_rate": 8.231750531537917e-05, + "loss": 1.3184, + "step": 3503 + }, + { + "epoch": 1.06359083320686, + "grad_norm": 0.4437430500984192, + "learning_rate": 8.231244304950897e-05, + "loss": 1.6532, + "step": 3504 + }, + { + "epoch": 1.0638943694035514, + "grad_norm": 0.5160355567932129, + "learning_rate": 8.230738078363876e-05, + "loss": 1.6429, + "step": 3505 + }, + { + "epoch": 1.0641979056002429, + "grad_norm": 0.5177229046821594, + "learning_rate": 8.230231851776856e-05, + "loss": 1.3577, + "step": 3506 + }, + { + "epoch": 1.0645014417969343, + "grad_norm": 0.5815109014511108, + "learning_rate": 8.229725625189836e-05, + "loss": 1.3545, + "step": 3507 + }, + { + "epoch": 1.0648049779936257, + "grad_norm": 0.7150763273239136, + "learning_rate": 8.229219398602816e-05, + "loss": 1.5478, + "step": 3508 + }, + { + "epoch": 1.0651085141903172, + "grad_norm": 0.5490028262138367, + "learning_rate": 8.228713172015795e-05, + "loss": 1.8155, + "step": 3509 + }, + { + "epoch": 1.0654120503870086, + "grad_norm": 0.455064982175827, + "learning_rate": 8.228206945428775e-05, + "loss": 1.4917, + "step": 3510 + }, + { + "epoch": 1.0657155865837, + "grad_norm": 1.305898666381836, + "learning_rate": 8.227700718841754e-05, + "loss": 1.6654, + "step": 3511 + }, + { + "epoch": 1.0660191227803915, + "grad_norm": 0.4300096929073334, + "learning_rate": 8.227194492254734e-05, + "loss": 1.8274, + "step": 3512 + }, + { + "epoch": 1.0663226589770831, + "grad_norm": 0.3954041600227356, + "learning_rate": 8.226688265667713e-05, + "loss": 1.537, + "step": 3513 + }, + { + "epoch": 1.0666261951737745, + "grad_norm": 0.48388218879699707, + "learning_rate": 8.226182039080693e-05, + "loss": 1.8136, + "step": 3514 + }, + { + "epoch": 1.066929731370466, + "grad_norm": 0.5675212740898132, + "learning_rate": 8.225675812493672e-05, + "loss": 1.7286, + "step": 3515 + }, + { + "epoch": 1.0672332675671574, + "grad_norm": 0.4250033497810364, + "learning_rate": 8.225169585906652e-05, + "loss": 1.6428, + "step": 3516 + }, + { + "epoch": 1.0675368037638489, + "grad_norm": 0.41837701201438904, + "learning_rate": 8.224663359319633e-05, + "loss": 1.0005, + "step": 3517 + }, + { + "epoch": 1.0678403399605403, + "grad_norm": 0.6654266119003296, + "learning_rate": 8.224157132732612e-05, + "loss": 1.5319, + "step": 3518 + }, + { + "epoch": 1.0681438761572317, + "grad_norm": 0.5207446813583374, + "learning_rate": 8.223650906145592e-05, + "loss": 1.7826, + "step": 3519 + }, + { + "epoch": 1.0684474123539232, + "grad_norm": 0.5787805914878845, + "learning_rate": 8.223144679558571e-05, + "loss": 1.6859, + "step": 3520 + }, + { + "epoch": 1.0687509485506146, + "grad_norm": 0.5311471819877625, + "learning_rate": 8.22263845297155e-05, + "loss": 1.9176, + "step": 3521 + }, + { + "epoch": 1.069054484747306, + "grad_norm": 0.5248377323150635, + "learning_rate": 8.22213222638453e-05, + "loss": 1.8143, + "step": 3522 + }, + { + "epoch": 1.0693580209439975, + "grad_norm": 0.48425552248954773, + "learning_rate": 8.22162599979751e-05, + "loss": 1.8119, + "step": 3523 + }, + { + "epoch": 1.0696615571406891, + "grad_norm": 0.49413445591926575, + "learning_rate": 8.221119773210489e-05, + "loss": 1.2554, + "step": 3524 + }, + { + "epoch": 1.0699650933373805, + "grad_norm": 0.48792800307273865, + "learning_rate": 8.220613546623468e-05, + "loss": 1.7597, + "step": 3525 + }, + { + "epoch": 1.070268629534072, + "grad_norm": 0.4943905174732208, + "learning_rate": 8.220107320036449e-05, + "loss": 0.9761, + "step": 3526 + }, + { + "epoch": 1.0705721657307634, + "grad_norm": 0.6244659423828125, + "learning_rate": 8.219601093449429e-05, + "loss": 0.7937, + "step": 3527 + }, + { + "epoch": 1.0708757019274548, + "grad_norm": 0.5777722001075745, + "learning_rate": 8.219094866862408e-05, + "loss": 1.2079, + "step": 3528 + }, + { + "epoch": 1.0711792381241463, + "grad_norm": 0.4799225926399231, + "learning_rate": 8.218588640275388e-05, + "loss": 1.3656, + "step": 3529 + }, + { + "epoch": 1.0714827743208377, + "grad_norm": 0.48858943581581116, + "learning_rate": 8.218082413688367e-05, + "loss": 1.6026, + "step": 3530 + }, + { + "epoch": 1.0717863105175292, + "grad_norm": 0.532616376876831, + "learning_rate": 8.217576187101347e-05, + "loss": 1.3968, + "step": 3531 + }, + { + "epoch": 1.0720898467142206, + "grad_norm": 0.4278903603553772, + "learning_rate": 8.217069960514326e-05, + "loss": 1.1862, + "step": 3532 + }, + { + "epoch": 1.0723933829109122, + "grad_norm": 0.47585421800613403, + "learning_rate": 8.216563733927306e-05, + "loss": 1.4731, + "step": 3533 + }, + { + "epoch": 1.0726969191076037, + "grad_norm": 0.5400151014328003, + "learning_rate": 8.216057507340285e-05, + "loss": 1.9027, + "step": 3534 + }, + { + "epoch": 1.073000455304295, + "grad_norm": 0.4817771911621094, + "learning_rate": 8.215551280753266e-05, + "loss": 1.7373, + "step": 3535 + }, + { + "epoch": 1.0733039915009865, + "grad_norm": 0.46320927143096924, + "learning_rate": 8.215045054166245e-05, + "loss": 1.691, + "step": 3536 + }, + { + "epoch": 1.073607527697678, + "grad_norm": 0.5020293593406677, + "learning_rate": 8.214538827579225e-05, + "loss": 1.8651, + "step": 3537 + }, + { + "epoch": 1.0739110638943694, + "grad_norm": 0.5326499342918396, + "learning_rate": 8.214032600992204e-05, + "loss": 1.5852, + "step": 3538 + }, + { + "epoch": 1.0742146000910608, + "grad_norm": 0.44219890236854553, + "learning_rate": 8.213526374405184e-05, + "loss": 1.7222, + "step": 3539 + }, + { + "epoch": 1.0745181362877523, + "grad_norm": 0.5475308895111084, + "learning_rate": 8.213020147818163e-05, + "loss": 1.3513, + "step": 3540 + }, + { + "epoch": 1.0748216724844437, + "grad_norm": 0.4063558876514435, + "learning_rate": 8.212513921231143e-05, + "loss": 1.5747, + "step": 3541 + }, + { + "epoch": 1.0751252086811351, + "grad_norm": 0.49913567304611206, + "learning_rate": 8.212007694644122e-05, + "loss": 1.8541, + "step": 3542 + }, + { + "epoch": 1.0754287448778266, + "grad_norm": 0.5605202913284302, + "learning_rate": 8.211501468057102e-05, + "loss": 1.5137, + "step": 3543 + }, + { + "epoch": 1.0757322810745182, + "grad_norm": 0.8763415813446045, + "learning_rate": 8.210995241470081e-05, + "loss": 1.3439, + "step": 3544 + }, + { + "epoch": 1.0760358172712097, + "grad_norm": 0.5468719601631165, + "learning_rate": 8.210489014883062e-05, + "loss": 1.8425, + "step": 3545 + }, + { + "epoch": 1.076339353467901, + "grad_norm": 0.6741669774055481, + "learning_rate": 8.209982788296042e-05, + "loss": 1.5244, + "step": 3546 + }, + { + "epoch": 1.0766428896645925, + "grad_norm": 0.4905114769935608, + "learning_rate": 8.209476561709021e-05, + "loss": 1.3453, + "step": 3547 + }, + { + "epoch": 1.076946425861284, + "grad_norm": 0.4014042615890503, + "learning_rate": 8.208970335122002e-05, + "loss": 1.7543, + "step": 3548 + }, + { + "epoch": 1.0772499620579754, + "grad_norm": 0.41040587425231934, + "learning_rate": 8.208464108534981e-05, + "loss": 1.4491, + "step": 3549 + }, + { + "epoch": 1.0775534982546668, + "grad_norm": 0.558845043182373, + "learning_rate": 8.207957881947961e-05, + "loss": 1.6391, + "step": 3550 + }, + { + "epoch": 1.0778570344513583, + "grad_norm": 0.5031947493553162, + "learning_rate": 8.20745165536094e-05, + "loss": 1.8803, + "step": 3551 + }, + { + "epoch": 1.0781605706480497, + "grad_norm": 0.5727876424789429, + "learning_rate": 8.20694542877392e-05, + "loss": 1.7184, + "step": 3552 + }, + { + "epoch": 1.0784641068447411, + "grad_norm": 0.506584107875824, + "learning_rate": 8.206439202186899e-05, + "loss": 1.6442, + "step": 3553 + }, + { + "epoch": 1.0787676430414326, + "grad_norm": 0.41179177165031433, + "learning_rate": 8.205932975599879e-05, + "loss": 1.8083, + "step": 3554 + }, + { + "epoch": 1.0790711792381242, + "grad_norm": 0.4515683054924011, + "learning_rate": 8.205426749012858e-05, + "loss": 1.675, + "step": 3555 + }, + { + "epoch": 1.0793747154348157, + "grad_norm": 0.7169219851493835, + "learning_rate": 8.204920522425839e-05, + "loss": 1.81, + "step": 3556 + }, + { + "epoch": 1.079678251631507, + "grad_norm": 0.4774874150753021, + "learning_rate": 8.204414295838819e-05, + "loss": 1.8261, + "step": 3557 + }, + { + "epoch": 1.0799817878281985, + "grad_norm": 0.5474801659584045, + "learning_rate": 8.203908069251798e-05, + "loss": 1.7529, + "step": 3558 + }, + { + "epoch": 1.08028532402489, + "grad_norm": 0.49499598145484924, + "learning_rate": 8.203401842664778e-05, + "loss": 1.6019, + "step": 3559 + }, + { + "epoch": 1.0805888602215814, + "grad_norm": 0.5839375257492065, + "learning_rate": 8.202895616077757e-05, + "loss": 1.1219, + "step": 3560 + }, + { + "epoch": 1.0808923964182728, + "grad_norm": 0.5204687714576721, + "learning_rate": 8.202389389490736e-05, + "loss": 2.0522, + "step": 3561 + }, + { + "epoch": 1.0811959326149643, + "grad_norm": 0.5370623469352722, + "learning_rate": 8.201883162903716e-05, + "loss": 1.5759, + "step": 3562 + }, + { + "epoch": 1.0814994688116557, + "grad_norm": 0.49342766404151917, + "learning_rate": 8.201376936316695e-05, + "loss": 1.4948, + "step": 3563 + }, + { + "epoch": 1.0818030050083474, + "grad_norm": 0.4796147644519806, + "learning_rate": 8.200870709729675e-05, + "loss": 2.0453, + "step": 3564 + }, + { + "epoch": 1.0821065412050388, + "grad_norm": 0.41939178109169006, + "learning_rate": 8.200364483142656e-05, + "loss": 1.2251, + "step": 3565 + }, + { + "epoch": 1.0824100774017302, + "grad_norm": 0.5219609141349792, + "learning_rate": 8.199858256555635e-05, + "loss": 1.3768, + "step": 3566 + }, + { + "epoch": 1.0827136135984217, + "grad_norm": 0.45907774567604065, + "learning_rate": 8.199352029968615e-05, + "loss": 1.4058, + "step": 3567 + }, + { + "epoch": 1.083017149795113, + "grad_norm": 0.6365559101104736, + "learning_rate": 8.198845803381594e-05, + "loss": 1.6417, + "step": 3568 + }, + { + "epoch": 1.0833206859918045, + "grad_norm": 0.5077771544456482, + "learning_rate": 8.198339576794574e-05, + "loss": 1.8395, + "step": 3569 + }, + { + "epoch": 1.083624222188496, + "grad_norm": 0.4947681128978729, + "learning_rate": 8.197833350207553e-05, + "loss": 1.6237, + "step": 3570 + }, + { + "epoch": 1.0839277583851874, + "grad_norm": 0.770715057849884, + "learning_rate": 8.197327123620533e-05, + "loss": 0.9829, + "step": 3571 + }, + { + "epoch": 1.0842312945818788, + "grad_norm": 0.5345532298088074, + "learning_rate": 8.196820897033512e-05, + "loss": 1.7476, + "step": 3572 + }, + { + "epoch": 1.0845348307785703, + "grad_norm": 0.5688590407371521, + "learning_rate": 8.196314670446492e-05, + "loss": 1.8579, + "step": 3573 + }, + { + "epoch": 1.0848383669752617, + "grad_norm": 0.5151784420013428, + "learning_rate": 8.195808443859472e-05, + "loss": 1.508, + "step": 3574 + }, + { + "epoch": 1.0851419031719534, + "grad_norm": 0.4951719045639038, + "learning_rate": 8.195302217272452e-05, + "loss": 1.776, + "step": 3575 + }, + { + "epoch": 1.0854454393686448, + "grad_norm": 0.4807814359664917, + "learning_rate": 8.194795990685431e-05, + "loss": 1.7716, + "step": 3576 + }, + { + "epoch": 1.0857489755653362, + "grad_norm": 0.4597873091697693, + "learning_rate": 8.194289764098411e-05, + "loss": 1.7043, + "step": 3577 + }, + { + "epoch": 1.0860525117620277, + "grad_norm": 0.45082420110702515, + "learning_rate": 8.19378353751139e-05, + "loss": 1.8152, + "step": 3578 + }, + { + "epoch": 1.086356047958719, + "grad_norm": 0.5934682488441467, + "learning_rate": 8.19327731092437e-05, + "loss": 1.5072, + "step": 3579 + }, + { + "epoch": 1.0866595841554105, + "grad_norm": 0.496003657579422, + "learning_rate": 8.192771084337349e-05, + "loss": 1.5421, + "step": 3580 + }, + { + "epoch": 1.086963120352102, + "grad_norm": 0.5661733746528625, + "learning_rate": 8.192264857750329e-05, + "loss": 1.3126, + "step": 3581 + }, + { + "epoch": 1.0872666565487934, + "grad_norm": 0.6730456352233887, + "learning_rate": 8.191758631163308e-05, + "loss": 1.5007, + "step": 3582 + }, + { + "epoch": 1.0875701927454848, + "grad_norm": 0.6048880815505981, + "learning_rate": 8.191252404576288e-05, + "loss": 1.7904, + "step": 3583 + }, + { + "epoch": 1.0878737289421763, + "grad_norm": 0.5846467614173889, + "learning_rate": 8.190746177989269e-05, + "loss": 1.824, + "step": 3584 + }, + { + "epoch": 1.0881772651388677, + "grad_norm": 0.47012564539909363, + "learning_rate": 8.190239951402248e-05, + "loss": 1.4203, + "step": 3585 + }, + { + "epoch": 1.0884808013355594, + "grad_norm": 0.5919317007064819, + "learning_rate": 8.189733724815228e-05, + "loss": 1.6285, + "step": 3586 + }, + { + "epoch": 1.0887843375322508, + "grad_norm": 0.4845069944858551, + "learning_rate": 8.189227498228207e-05, + "loss": 1.8242, + "step": 3587 + }, + { + "epoch": 1.0890878737289422, + "grad_norm": 0.6998928785324097, + "learning_rate": 8.188721271641186e-05, + "loss": 1.7961, + "step": 3588 + }, + { + "epoch": 1.0893914099256337, + "grad_norm": 0.5783026814460754, + "learning_rate": 8.188215045054166e-05, + "loss": 1.7251, + "step": 3589 + }, + { + "epoch": 1.089694946122325, + "grad_norm": 0.48840558528900146, + "learning_rate": 8.187708818467145e-05, + "loss": 1.5038, + "step": 3590 + }, + { + "epoch": 1.0899984823190165, + "grad_norm": 0.47411298751831055, + "learning_rate": 8.187202591880125e-05, + "loss": 1.2775, + "step": 3591 + }, + { + "epoch": 1.090302018515708, + "grad_norm": 0.39426228404045105, + "learning_rate": 8.186696365293106e-05, + "loss": 1.6157, + "step": 3592 + }, + { + "epoch": 1.0906055547123994, + "grad_norm": 0.4736963212490082, + "learning_rate": 8.186190138706085e-05, + "loss": 1.7205, + "step": 3593 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 0.4530097544193268, + "learning_rate": 8.185683912119065e-05, + "loss": 1.4091, + "step": 3594 + }, + { + "epoch": 1.0912126271057823, + "grad_norm": 0.6029506921768188, + "learning_rate": 8.185177685532046e-05, + "loss": 1.5175, + "step": 3595 + }, + { + "epoch": 1.091516163302474, + "grad_norm": 0.5318018198013306, + "learning_rate": 8.184671458945025e-05, + "loss": 1.9868, + "step": 3596 + }, + { + "epoch": 1.0918196994991654, + "grad_norm": 0.515565812587738, + "learning_rate": 8.184165232358005e-05, + "loss": 1.7638, + "step": 3597 + }, + { + "epoch": 1.0921232356958568, + "grad_norm": 0.54291832447052, + "learning_rate": 8.183659005770984e-05, + "loss": 1.5345, + "step": 3598 + }, + { + "epoch": 1.0924267718925482, + "grad_norm": 0.45856529474258423, + "learning_rate": 8.183152779183963e-05, + "loss": 1.6028, + "step": 3599 + }, + { + "epoch": 1.0927303080892397, + "grad_norm": 0.5494308471679688, + "learning_rate": 8.182646552596943e-05, + "loss": 1.743, + "step": 3600 + }, + { + "epoch": 1.093033844285931, + "grad_norm": 0.6626517176628113, + "learning_rate": 8.182140326009922e-05, + "loss": 1.2859, + "step": 3601 + }, + { + "epoch": 1.0933373804826225, + "grad_norm": 0.5031691193580627, + "learning_rate": 8.181634099422902e-05, + "loss": 1.742, + "step": 3602 + }, + { + "epoch": 1.093640916679314, + "grad_norm": 0.4413023889064789, + "learning_rate": 8.181127872835881e-05, + "loss": 1.7242, + "step": 3603 + }, + { + "epoch": 1.0939444528760054, + "grad_norm": 1.5234955549240112, + "learning_rate": 8.180621646248862e-05, + "loss": 1.9161, + "step": 3604 + }, + { + "epoch": 1.0942479890726968, + "grad_norm": 0.4144064784049988, + "learning_rate": 8.180115419661842e-05, + "loss": 1.7456, + "step": 3605 + }, + { + "epoch": 1.0945515252693885, + "grad_norm": 0.39565610885620117, + "learning_rate": 8.179609193074821e-05, + "loss": 1.9448, + "step": 3606 + }, + { + "epoch": 1.09485506146608, + "grad_norm": 0.484910786151886, + "learning_rate": 8.1791029664878e-05, + "loss": 1.8062, + "step": 3607 + }, + { + "epoch": 1.0951585976627713, + "grad_norm": 0.4055248200893402, + "learning_rate": 8.17859673990078e-05, + "loss": 1.7592, + "step": 3608 + }, + { + "epoch": 1.0954621338594628, + "grad_norm": 0.48900482058525085, + "learning_rate": 8.17809051331376e-05, + "loss": 1.7554, + "step": 3609 + }, + { + "epoch": 1.0957656700561542, + "grad_norm": 0.4918522834777832, + "learning_rate": 8.177584286726739e-05, + "loss": 1.3203, + "step": 3610 + }, + { + "epoch": 1.0960692062528457, + "grad_norm": 0.4044332206249237, + "learning_rate": 8.177078060139719e-05, + "loss": 1.1163, + "step": 3611 + }, + { + "epoch": 1.096372742449537, + "grad_norm": 0.5623982548713684, + "learning_rate": 8.176571833552698e-05, + "loss": 1.782, + "step": 3612 + }, + { + "epoch": 1.0966762786462285, + "grad_norm": 0.49447813630104065, + "learning_rate": 8.176065606965679e-05, + "loss": 1.8644, + "step": 3613 + }, + { + "epoch": 1.09697981484292, + "grad_norm": 0.48916736245155334, + "learning_rate": 8.175559380378658e-05, + "loss": 1.8049, + "step": 3614 + }, + { + "epoch": 1.0972833510396114, + "grad_norm": 0.4916020333766937, + "learning_rate": 8.175053153791638e-05, + "loss": 1.5267, + "step": 3615 + }, + { + "epoch": 1.0975868872363028, + "grad_norm": 0.5263971090316772, + "learning_rate": 8.174546927204617e-05, + "loss": 1.8126, + "step": 3616 + }, + { + "epoch": 1.0978904234329945, + "grad_norm": 0.5741640329360962, + "learning_rate": 8.174040700617597e-05, + "loss": 1.2607, + "step": 3617 + }, + { + "epoch": 1.098193959629686, + "grad_norm": 1.0414916276931763, + "learning_rate": 8.173534474030576e-05, + "loss": 1.6205, + "step": 3618 + }, + { + "epoch": 1.0984974958263773, + "grad_norm": 0.494989275932312, + "learning_rate": 8.173028247443556e-05, + "loss": 1.9782, + "step": 3619 + }, + { + "epoch": 1.0988010320230688, + "grad_norm": 0.4781859815120697, + "learning_rate": 8.172522020856535e-05, + "loss": 1.9079, + "step": 3620 + }, + { + "epoch": 1.0991045682197602, + "grad_norm": 0.5632685422897339, + "learning_rate": 8.172015794269515e-05, + "loss": 1.9584, + "step": 3621 + }, + { + "epoch": 1.0994081044164516, + "grad_norm": 0.6196724772453308, + "learning_rate": 8.171509567682494e-05, + "loss": 1.5911, + "step": 3622 + }, + { + "epoch": 1.099711640613143, + "grad_norm": 0.9034321904182434, + "learning_rate": 8.171003341095475e-05, + "loss": 1.5019, + "step": 3623 + }, + { + "epoch": 1.1000151768098345, + "grad_norm": 0.370377779006958, + "learning_rate": 8.170497114508455e-05, + "loss": 1.2399, + "step": 3624 + }, + { + "epoch": 1.100318713006526, + "grad_norm": 0.5086367130279541, + "learning_rate": 8.169990887921434e-05, + "loss": 1.3943, + "step": 3625 + }, + { + "epoch": 1.1006222492032174, + "grad_norm": 0.5467544198036194, + "learning_rate": 8.169484661334413e-05, + "loss": 1.9572, + "step": 3626 + }, + { + "epoch": 1.100925785399909, + "grad_norm": 0.5355213284492493, + "learning_rate": 8.168978434747393e-05, + "loss": 1.5189, + "step": 3627 + }, + { + "epoch": 1.1012293215966005, + "grad_norm": 0.48003602027893066, + "learning_rate": 8.168472208160372e-05, + "loss": 1.7356, + "step": 3628 + }, + { + "epoch": 1.101532857793292, + "grad_norm": 0.5566051006317139, + "learning_rate": 8.167965981573352e-05, + "loss": 1.4201, + "step": 3629 + }, + { + "epoch": 1.1018363939899833, + "grad_norm": 0.504786491394043, + "learning_rate": 8.167459754986331e-05, + "loss": 1.6986, + "step": 3630 + }, + { + "epoch": 1.1021399301866748, + "grad_norm": 0.5683818459510803, + "learning_rate": 8.166953528399311e-05, + "loss": 1.633, + "step": 3631 + }, + { + "epoch": 1.1024434663833662, + "grad_norm": 0.4428302049636841, + "learning_rate": 8.166447301812292e-05, + "loss": 1.715, + "step": 3632 + }, + { + "epoch": 1.1027470025800576, + "grad_norm": 0.5696395635604858, + "learning_rate": 8.165941075225271e-05, + "loss": 1.0826, + "step": 3633 + }, + { + "epoch": 1.103050538776749, + "grad_norm": 0.5010141134262085, + "learning_rate": 8.16543484863825e-05, + "loss": 1.3363, + "step": 3634 + }, + { + "epoch": 1.1033540749734405, + "grad_norm": 0.5403134822845459, + "learning_rate": 8.16492862205123e-05, + "loss": 1.6684, + "step": 3635 + }, + { + "epoch": 1.103657611170132, + "grad_norm": 2.0790061950683594, + "learning_rate": 8.16442239546421e-05, + "loss": 1.7737, + "step": 3636 + }, + { + "epoch": 1.1039611473668236, + "grad_norm": 0.5232675671577454, + "learning_rate": 8.16391616887719e-05, + "loss": 1.8925, + "step": 3637 + }, + { + "epoch": 1.104264683563515, + "grad_norm": 0.5211415886878967, + "learning_rate": 8.16340994229017e-05, + "loss": 1.5844, + "step": 3638 + }, + { + "epoch": 1.1045682197602065, + "grad_norm": 0.4988497495651245, + "learning_rate": 8.16290371570315e-05, + "loss": 1.4171, + "step": 3639 + }, + { + "epoch": 1.104871755956898, + "grad_norm": 0.5389904975891113, + "learning_rate": 8.162397489116129e-05, + "loss": 1.9304, + "step": 3640 + }, + { + "epoch": 1.1051752921535893, + "grad_norm": 0.5663283467292786, + "learning_rate": 8.161891262529108e-05, + "loss": 1.7161, + "step": 3641 + }, + { + "epoch": 1.1054788283502808, + "grad_norm": 0.5647397637367249, + "learning_rate": 8.161385035942088e-05, + "loss": 1.4674, + "step": 3642 + }, + { + "epoch": 1.1057823645469722, + "grad_norm": 0.5939072966575623, + "learning_rate": 8.160878809355069e-05, + "loss": 1.0629, + "step": 3643 + }, + { + "epoch": 1.1060859007436636, + "grad_norm": 0.5135029554367065, + "learning_rate": 8.160372582768048e-05, + "loss": 1.705, + "step": 3644 + }, + { + "epoch": 1.106389436940355, + "grad_norm": 0.45538634061813354, + "learning_rate": 8.159866356181028e-05, + "loss": 1.6446, + "step": 3645 + }, + { + "epoch": 1.1066929731370465, + "grad_norm": 0.5306661128997803, + "learning_rate": 8.159360129594007e-05, + "loss": 1.2614, + "step": 3646 + }, + { + "epoch": 1.106996509333738, + "grad_norm": 0.5330759882926941, + "learning_rate": 8.158853903006987e-05, + "loss": 1.5618, + "step": 3647 + }, + { + "epoch": 1.1073000455304296, + "grad_norm": 0.4016754627227783, + "learning_rate": 8.158347676419966e-05, + "loss": 1.6048, + "step": 3648 + }, + { + "epoch": 1.107603581727121, + "grad_norm": 0.4580400288105011, + "learning_rate": 8.157841449832946e-05, + "loss": 2.0648, + "step": 3649 + }, + { + "epoch": 1.1079071179238125, + "grad_norm": 0.6977163553237915, + "learning_rate": 8.157335223245925e-05, + "loss": 1.2671, + "step": 3650 + }, + { + "epoch": 1.108210654120504, + "grad_norm": 0.5107961297035217, + "learning_rate": 8.156828996658905e-05, + "loss": 1.8615, + "step": 3651 + }, + { + "epoch": 1.1085141903171953, + "grad_norm": 0.5081155300140381, + "learning_rate": 8.156322770071885e-05, + "loss": 1.4985, + "step": 3652 + }, + { + "epoch": 1.1088177265138868, + "grad_norm": 0.5033918619155884, + "learning_rate": 8.155816543484865e-05, + "loss": 1.3806, + "step": 3653 + }, + { + "epoch": 1.1091212627105782, + "grad_norm": 0.4880038797855377, + "learning_rate": 8.155310316897844e-05, + "loss": 1.4382, + "step": 3654 + }, + { + "epoch": 1.1094247989072696, + "grad_norm": 0.5263247489929199, + "learning_rate": 8.154804090310824e-05, + "loss": 1.5557, + "step": 3655 + }, + { + "epoch": 1.109728335103961, + "grad_norm": 0.5132997632026672, + "learning_rate": 8.154297863723803e-05, + "loss": 1.6921, + "step": 3656 + }, + { + "epoch": 1.1100318713006525, + "grad_norm": 0.4751412570476532, + "learning_rate": 8.153791637136783e-05, + "loss": 1.4542, + "step": 3657 + }, + { + "epoch": 1.1103354074973442, + "grad_norm": 0.47098231315612793, + "learning_rate": 8.153285410549762e-05, + "loss": 1.7529, + "step": 3658 + }, + { + "epoch": 1.1106389436940356, + "grad_norm": 0.5147930979728699, + "learning_rate": 8.152779183962742e-05, + "loss": 1.6664, + "step": 3659 + }, + { + "epoch": 1.110942479890727, + "grad_norm": 0.557507336139679, + "learning_rate": 8.152272957375721e-05, + "loss": 1.4994, + "step": 3660 + }, + { + "epoch": 1.1112460160874185, + "grad_norm": 0.47173547744750977, + "learning_rate": 8.1517667307887e-05, + "loss": 1.7713, + "step": 3661 + }, + { + "epoch": 1.11154955228411, + "grad_norm": 0.42760956287384033, + "learning_rate": 8.151260504201682e-05, + "loss": 1.9084, + "step": 3662 + }, + { + "epoch": 1.1118530884808013, + "grad_norm": 0.6474758982658386, + "learning_rate": 8.150754277614661e-05, + "loss": 1.9066, + "step": 3663 + }, + { + "epoch": 1.1121566246774928, + "grad_norm": 0.4683403968811035, + "learning_rate": 8.15024805102764e-05, + "loss": 1.9612, + "step": 3664 + }, + { + "epoch": 1.1124601608741842, + "grad_norm": 0.4510393440723419, + "learning_rate": 8.14974182444062e-05, + "loss": 1.8173, + "step": 3665 + }, + { + "epoch": 1.1127636970708756, + "grad_norm": 0.4649883210659027, + "learning_rate": 8.1492355978536e-05, + "loss": 1.7189, + "step": 3666 + }, + { + "epoch": 1.113067233267567, + "grad_norm": 0.4617062509059906, + "learning_rate": 8.148729371266579e-05, + "loss": 1.5588, + "step": 3667 + }, + { + "epoch": 1.1133707694642587, + "grad_norm": 0.38514718413352966, + "learning_rate": 8.148223144679558e-05, + "loss": 1.2109, + "step": 3668 + }, + { + "epoch": 1.1136743056609502, + "grad_norm": 0.5455525517463684, + "learning_rate": 8.147716918092538e-05, + "loss": 1.3428, + "step": 3669 + }, + { + "epoch": 1.1139778418576416, + "grad_norm": 0.47001487016677856, + "learning_rate": 8.147210691505517e-05, + "loss": 1.8286, + "step": 3670 + }, + { + "epoch": 1.114281378054333, + "grad_norm": 0.9772850871086121, + "learning_rate": 8.146704464918498e-05, + "loss": 1.5406, + "step": 3671 + }, + { + "epoch": 1.1145849142510245, + "grad_norm": 0.5333663821220398, + "learning_rate": 8.146198238331478e-05, + "loss": 1.5502, + "step": 3672 + }, + { + "epoch": 1.114888450447716, + "grad_norm": 0.5091346502304077, + "learning_rate": 8.145692011744457e-05, + "loss": 1.6799, + "step": 3673 + }, + { + "epoch": 1.1151919866444073, + "grad_norm": 0.6163585186004639, + "learning_rate": 8.145185785157437e-05, + "loss": 1.4412, + "step": 3674 + }, + { + "epoch": 1.1154955228410988, + "grad_norm": 0.5357707142829895, + "learning_rate": 8.144679558570416e-05, + "loss": 1.6769, + "step": 3675 + }, + { + "epoch": 1.1157990590377902, + "grad_norm": 0.45370087027549744, + "learning_rate": 8.144173331983396e-05, + "loss": 1.5989, + "step": 3676 + }, + { + "epoch": 1.1161025952344816, + "grad_norm": 0.4834679067134857, + "learning_rate": 8.143667105396375e-05, + "loss": 1.4882, + "step": 3677 + }, + { + "epoch": 1.116406131431173, + "grad_norm": 0.5106636881828308, + "learning_rate": 8.143160878809355e-05, + "loss": 1.414, + "step": 3678 + }, + { + "epoch": 1.1167096676278647, + "grad_norm": 0.4484593868255615, + "learning_rate": 8.142654652222334e-05, + "loss": 1.7469, + "step": 3679 + }, + { + "epoch": 1.1170132038245562, + "grad_norm": 0.43629521131515503, + "learning_rate": 8.142148425635315e-05, + "loss": 1.6167, + "step": 3680 + }, + { + "epoch": 1.1173167400212476, + "grad_norm": 0.46599555015563965, + "learning_rate": 8.141642199048294e-05, + "loss": 1.6053, + "step": 3681 + }, + { + "epoch": 1.117620276217939, + "grad_norm": 0.46938660740852356, + "learning_rate": 8.141135972461275e-05, + "loss": 1.4003, + "step": 3682 + }, + { + "epoch": 1.1179238124146305, + "grad_norm": 0.5234637260437012, + "learning_rate": 8.140629745874255e-05, + "loss": 1.8717, + "step": 3683 + }, + { + "epoch": 1.118227348611322, + "grad_norm": 0.5148733854293823, + "learning_rate": 8.140123519287234e-05, + "loss": 1.8073, + "step": 3684 + }, + { + "epoch": 1.1185308848080133, + "grad_norm": 0.4216020405292511, + "learning_rate": 8.139617292700214e-05, + "loss": 1.3997, + "step": 3685 + }, + { + "epoch": 1.1188344210047048, + "grad_norm": 0.44757702946662903, + "learning_rate": 8.139111066113193e-05, + "loss": 1.7372, + "step": 3686 + }, + { + "epoch": 1.1191379572013962, + "grad_norm": 0.4530711770057678, + "learning_rate": 8.138604839526173e-05, + "loss": 1.6412, + "step": 3687 + }, + { + "epoch": 1.1194414933980876, + "grad_norm": 0.45930105447769165, + "learning_rate": 8.138098612939152e-05, + "loss": 1.8236, + "step": 3688 + }, + { + "epoch": 1.119745029594779, + "grad_norm": 0.4823213219642639, + "learning_rate": 8.137592386352132e-05, + "loss": 1.4396, + "step": 3689 + }, + { + "epoch": 1.1200485657914707, + "grad_norm": 0.46218788623809814, + "learning_rate": 8.137086159765111e-05, + "loss": 1.7306, + "step": 3690 + }, + { + "epoch": 1.1203521019881622, + "grad_norm": 0.47568848729133606, + "learning_rate": 8.136579933178092e-05, + "loss": 1.7071, + "step": 3691 + }, + { + "epoch": 1.1206556381848536, + "grad_norm": 0.491416335105896, + "learning_rate": 8.136073706591071e-05, + "loss": 1.4723, + "step": 3692 + }, + { + "epoch": 1.120959174381545, + "grad_norm": 0.5410199761390686, + "learning_rate": 8.135567480004051e-05, + "loss": 1.325, + "step": 3693 + }, + { + "epoch": 1.1212627105782365, + "grad_norm": 0.49941226840019226, + "learning_rate": 8.13506125341703e-05, + "loss": 1.717, + "step": 3694 + }, + { + "epoch": 1.1215662467749279, + "grad_norm": 0.5728047490119934, + "learning_rate": 8.13455502683001e-05, + "loss": 1.7745, + "step": 3695 + }, + { + "epoch": 1.1218697829716193, + "grad_norm": 0.5127580165863037, + "learning_rate": 8.134048800242989e-05, + "loss": 1.9077, + "step": 3696 + }, + { + "epoch": 1.1221733191683108, + "grad_norm": 0.7718825340270996, + "learning_rate": 8.133542573655969e-05, + "loss": 1.6438, + "step": 3697 + }, + { + "epoch": 1.1224768553650022, + "grad_norm": 0.48605823516845703, + "learning_rate": 8.133036347068948e-05, + "loss": 1.7385, + "step": 3698 + }, + { + "epoch": 1.1227803915616938, + "grad_norm": 0.7080191969871521, + "learning_rate": 8.132530120481928e-05, + "loss": 1.6901, + "step": 3699 + }, + { + "epoch": 1.1230839277583853, + "grad_norm": 0.5506136417388916, + "learning_rate": 8.132023893894907e-05, + "loss": 1.451, + "step": 3700 + }, + { + "epoch": 1.1233874639550767, + "grad_norm": 0.40935492515563965, + "learning_rate": 8.131517667307888e-05, + "loss": 1.1997, + "step": 3701 + }, + { + "epoch": 1.1236910001517681, + "grad_norm": 0.695495069026947, + "learning_rate": 8.131011440720867e-05, + "loss": 1.5494, + "step": 3702 + }, + { + "epoch": 1.1239945363484596, + "grad_norm": 0.5104714035987854, + "learning_rate": 8.130505214133847e-05, + "loss": 1.2269, + "step": 3703 + }, + { + "epoch": 1.124298072545151, + "grad_norm": 0.5129423141479492, + "learning_rate": 8.129998987546826e-05, + "loss": 1.5293, + "step": 3704 + }, + { + "epoch": 1.1246016087418425, + "grad_norm": 0.4727494716644287, + "learning_rate": 8.129492760959806e-05, + "loss": 1.6413, + "step": 3705 + }, + { + "epoch": 1.1249051449385339, + "grad_norm": 0.4820862114429474, + "learning_rate": 8.128986534372785e-05, + "loss": 1.1424, + "step": 3706 + }, + { + "epoch": 1.1252086811352253, + "grad_norm": 0.37725722789764404, + "learning_rate": 8.128480307785765e-05, + "loss": 1.7004, + "step": 3707 + }, + { + "epoch": 1.1255122173319168, + "grad_norm": 0.4898647964000702, + "learning_rate": 8.127974081198744e-05, + "loss": 1.8321, + "step": 3708 + }, + { + "epoch": 1.1258157535286082, + "grad_norm": 0.5332454442977905, + "learning_rate": 8.127467854611724e-05, + "loss": 1.4906, + "step": 3709 + }, + { + "epoch": 1.1261192897252998, + "grad_norm": 1.1369999647140503, + "learning_rate": 8.126961628024705e-05, + "loss": 1.8496, + "step": 3710 + }, + { + "epoch": 1.1264228259219913, + "grad_norm": 0.4872293472290039, + "learning_rate": 8.126455401437684e-05, + "loss": 1.6577, + "step": 3711 + }, + { + "epoch": 1.1267263621186827, + "grad_norm": 0.5259954333305359, + "learning_rate": 8.125949174850664e-05, + "loss": 1.3823, + "step": 3712 + }, + { + "epoch": 1.1270298983153741, + "grad_norm": 0.44852215051651, + "learning_rate": 8.125442948263643e-05, + "loss": 1.7758, + "step": 3713 + }, + { + "epoch": 1.1273334345120656, + "grad_norm": 0.49278607964515686, + "learning_rate": 8.124936721676623e-05, + "loss": 1.5611, + "step": 3714 + }, + { + "epoch": 1.127636970708757, + "grad_norm": 0.5423696041107178, + "learning_rate": 8.124430495089602e-05, + "loss": 1.5364, + "step": 3715 + }, + { + "epoch": 1.1279405069054484, + "grad_norm": 0.8286866545677185, + "learning_rate": 8.123924268502582e-05, + "loss": 1.538, + "step": 3716 + }, + { + "epoch": 1.1282440431021399, + "grad_norm": 0.4774687886238098, + "learning_rate": 8.123418041915561e-05, + "loss": 1.9607, + "step": 3717 + }, + { + "epoch": 1.1285475792988313, + "grad_norm": 0.5174298286437988, + "learning_rate": 8.12291181532854e-05, + "loss": 1.4305, + "step": 3718 + }, + { + "epoch": 1.1288511154955228, + "grad_norm": 0.4205905795097351, + "learning_rate": 8.122405588741521e-05, + "loss": 1.6605, + "step": 3719 + }, + { + "epoch": 1.1291546516922142, + "grad_norm": 0.49820128083229065, + "learning_rate": 8.121899362154501e-05, + "loss": 1.6201, + "step": 3720 + }, + { + "epoch": 1.1294581878889058, + "grad_norm": 0.6406720280647278, + "learning_rate": 8.12139313556748e-05, + "loss": 1.645, + "step": 3721 + }, + { + "epoch": 1.1297617240855973, + "grad_norm": 0.5179185271263123, + "learning_rate": 8.12088690898046e-05, + "loss": 1.2224, + "step": 3722 + }, + { + "epoch": 1.1300652602822887, + "grad_norm": 0.6311380863189697, + "learning_rate": 8.120380682393439e-05, + "loss": 1.8095, + "step": 3723 + }, + { + "epoch": 1.1303687964789801, + "grad_norm": 0.6742674112319946, + "learning_rate": 8.119874455806419e-05, + "loss": 1.2631, + "step": 3724 + }, + { + "epoch": 1.1306723326756716, + "grad_norm": 0.857227623462677, + "learning_rate": 8.1193682292194e-05, + "loss": 1.6925, + "step": 3725 + }, + { + "epoch": 1.130975868872363, + "grad_norm": 0.46885544061660767, + "learning_rate": 8.118862002632379e-05, + "loss": 1.601, + "step": 3726 + }, + { + "epoch": 1.1312794050690544, + "grad_norm": 0.4162534773349762, + "learning_rate": 8.118355776045359e-05, + "loss": 1.8378, + "step": 3727 + }, + { + "epoch": 1.1315829412657459, + "grad_norm": 0.42384031414985657, + "learning_rate": 8.117849549458338e-05, + "loss": 1.9497, + "step": 3728 + }, + { + "epoch": 1.1318864774624373, + "grad_norm": 0.5691695213317871, + "learning_rate": 8.117343322871317e-05, + "loss": 1.6685, + "step": 3729 + }, + { + "epoch": 1.132190013659129, + "grad_norm": 0.5050380229949951, + "learning_rate": 8.116837096284298e-05, + "loss": 1.6185, + "step": 3730 + }, + { + "epoch": 1.1324935498558204, + "grad_norm": 0.5230247974395752, + "learning_rate": 8.116330869697278e-05, + "loss": 1.5512, + "step": 3731 + }, + { + "epoch": 1.1327970860525118, + "grad_norm": 0.5464157462120056, + "learning_rate": 8.115824643110257e-05, + "loss": 1.6746, + "step": 3732 + }, + { + "epoch": 1.1331006222492033, + "grad_norm": 0.5749616622924805, + "learning_rate": 8.115318416523237e-05, + "loss": 1.572, + "step": 3733 + }, + { + "epoch": 1.1334041584458947, + "grad_norm": 0.6497030854225159, + "learning_rate": 8.114812189936216e-05, + "loss": 1.1994, + "step": 3734 + }, + { + "epoch": 1.1337076946425861, + "grad_norm": 0.4995471239089966, + "learning_rate": 8.114305963349196e-05, + "loss": 1.7605, + "step": 3735 + }, + { + "epoch": 1.1340112308392776, + "grad_norm": 0.4408343732357025, + "learning_rate": 8.113799736762175e-05, + "loss": 1.164, + "step": 3736 + }, + { + "epoch": 1.134314767035969, + "grad_norm": 0.49405989050865173, + "learning_rate": 8.113293510175155e-05, + "loss": 1.2219, + "step": 3737 + }, + { + "epoch": 1.1346183032326604, + "grad_norm": 0.5476012825965881, + "learning_rate": 8.112787283588134e-05, + "loss": 1.7411, + "step": 3738 + }, + { + "epoch": 1.1349218394293519, + "grad_norm": 0.4971032738685608, + "learning_rate": 8.112281057001114e-05, + "loss": 1.9983, + "step": 3739 + }, + { + "epoch": 1.1352253756260433, + "grad_norm": 0.5163978338241577, + "learning_rate": 8.111774830414094e-05, + "loss": 1.5051, + "step": 3740 + }, + { + "epoch": 1.135528911822735, + "grad_norm": 0.41611355543136597, + "learning_rate": 8.111268603827074e-05, + "loss": 1.3488, + "step": 3741 + }, + { + "epoch": 1.1358324480194264, + "grad_norm": 0.595660924911499, + "learning_rate": 8.110762377240053e-05, + "loss": 1.1249, + "step": 3742 + }, + { + "epoch": 1.1361359842161178, + "grad_norm": 0.5125541090965271, + "learning_rate": 8.110256150653033e-05, + "loss": 1.3889, + "step": 3743 + }, + { + "epoch": 1.1364395204128093, + "grad_norm": 0.5227445960044861, + "learning_rate": 8.109749924066012e-05, + "loss": 1.8178, + "step": 3744 + }, + { + "epoch": 1.1367430566095007, + "grad_norm": 0.551460325717926, + "learning_rate": 8.109243697478992e-05, + "loss": 1.5691, + "step": 3745 + }, + { + "epoch": 1.1370465928061921, + "grad_norm": 0.5809578895568848, + "learning_rate": 8.108737470891971e-05, + "loss": 1.6748, + "step": 3746 + }, + { + "epoch": 1.1373501290028836, + "grad_norm": 0.5961951017379761, + "learning_rate": 8.108231244304951e-05, + "loss": 1.7098, + "step": 3747 + }, + { + "epoch": 1.137653665199575, + "grad_norm": 0.5831936597824097, + "learning_rate": 8.10772501771793e-05, + "loss": 1.7794, + "step": 3748 + }, + { + "epoch": 1.1379572013962664, + "grad_norm": 0.5263493657112122, + "learning_rate": 8.107218791130911e-05, + "loss": 1.5674, + "step": 3749 + }, + { + "epoch": 1.1382607375929579, + "grad_norm": 0.7186266183853149, + "learning_rate": 8.10671256454389e-05, + "loss": 1.4561, + "step": 3750 + }, + { + "epoch": 1.1385642737896493, + "grad_norm": 0.4602220952510834, + "learning_rate": 8.10620633795687e-05, + "loss": 1.9415, + "step": 3751 + }, + { + "epoch": 1.138867809986341, + "grad_norm": 0.6327370405197144, + "learning_rate": 8.10570011136985e-05, + "loss": 1.4777, + "step": 3752 + }, + { + "epoch": 1.1391713461830324, + "grad_norm": 0.5156751275062561, + "learning_rate": 8.105193884782829e-05, + "loss": 1.6268, + "step": 3753 + }, + { + "epoch": 1.1394748823797238, + "grad_norm": 0.5427886247634888, + "learning_rate": 8.104687658195809e-05, + "loss": 1.7466, + "step": 3754 + }, + { + "epoch": 1.1397784185764153, + "grad_norm": 0.4821757972240448, + "learning_rate": 8.104181431608788e-05, + "loss": 1.8944, + "step": 3755 + }, + { + "epoch": 1.1400819547731067, + "grad_norm": 0.5586848258972168, + "learning_rate": 8.103675205021767e-05, + "loss": 1.4175, + "step": 3756 + }, + { + "epoch": 1.1403854909697981, + "grad_norm": 0.4749287962913513, + "learning_rate": 8.103168978434747e-05, + "loss": 1.71, + "step": 3757 + }, + { + "epoch": 1.1406890271664896, + "grad_norm": 0.5775233507156372, + "learning_rate": 8.102662751847728e-05, + "loss": 1.2132, + "step": 3758 + }, + { + "epoch": 1.140992563363181, + "grad_norm": 0.8754368424415588, + "learning_rate": 8.102156525260707e-05, + "loss": 1.6928, + "step": 3759 + }, + { + "epoch": 1.1412960995598724, + "grad_norm": 0.512400209903717, + "learning_rate": 8.101650298673687e-05, + "loss": 1.7876, + "step": 3760 + }, + { + "epoch": 1.141599635756564, + "grad_norm": 0.5088605284690857, + "learning_rate": 8.101144072086666e-05, + "loss": 1.5775, + "step": 3761 + }, + { + "epoch": 1.1419031719532553, + "grad_norm": 0.5294544100761414, + "learning_rate": 8.100637845499646e-05, + "loss": 1.7209, + "step": 3762 + }, + { + "epoch": 1.142206708149947, + "grad_norm": 0.534830629825592, + "learning_rate": 8.100131618912625e-05, + "loss": 1.59, + "step": 3763 + }, + { + "epoch": 1.1425102443466384, + "grad_norm": 0.5802815556526184, + "learning_rate": 8.099625392325605e-05, + "loss": 1.6018, + "step": 3764 + }, + { + "epoch": 1.1428137805433298, + "grad_norm": 0.5445258021354675, + "learning_rate": 8.099119165738584e-05, + "loss": 1.1691, + "step": 3765 + }, + { + "epoch": 1.1431173167400213, + "grad_norm": 0.7283796072006226, + "learning_rate": 8.098612939151564e-05, + "loss": 1.592, + "step": 3766 + }, + { + "epoch": 1.1434208529367127, + "grad_norm": 0.5029062032699585, + "learning_rate": 8.098106712564543e-05, + "loss": 1.6991, + "step": 3767 + }, + { + "epoch": 1.1437243891334041, + "grad_norm": 0.5014142394065857, + "learning_rate": 8.097600485977524e-05, + "loss": 1.4275, + "step": 3768 + }, + { + "epoch": 1.1440279253300956, + "grad_norm": 0.5292351245880127, + "learning_rate": 8.097094259390503e-05, + "loss": 1.4919, + "step": 3769 + }, + { + "epoch": 1.144331461526787, + "grad_norm": 0.47075581550598145, + "learning_rate": 8.096588032803484e-05, + "loss": 1.1845, + "step": 3770 + }, + { + "epoch": 1.1446349977234784, + "grad_norm": 1.0443851947784424, + "learning_rate": 8.096081806216464e-05, + "loss": 1.7547, + "step": 3771 + }, + { + "epoch": 1.14493853392017, + "grad_norm": 0.4734131991863251, + "learning_rate": 8.095575579629443e-05, + "loss": 1.7828, + "step": 3772 + }, + { + "epoch": 1.1452420701168615, + "grad_norm": 0.5368636250495911, + "learning_rate": 8.095069353042423e-05, + "loss": 1.7424, + "step": 3773 + }, + { + "epoch": 1.145545606313553, + "grad_norm": 0.42162859439849854, + "learning_rate": 8.094563126455402e-05, + "loss": 1.6908, + "step": 3774 + }, + { + "epoch": 1.1458491425102444, + "grad_norm": 0.5565983057022095, + "learning_rate": 8.094056899868382e-05, + "loss": 1.7771, + "step": 3775 + }, + { + "epoch": 1.1461526787069358, + "grad_norm": 1.02000892162323, + "learning_rate": 8.093550673281361e-05, + "loss": 1.416, + "step": 3776 + }, + { + "epoch": 1.1464562149036273, + "grad_norm": 0.5681176781654358, + "learning_rate": 8.09304444669434e-05, + "loss": 1.6256, + "step": 3777 + }, + { + "epoch": 1.1467597511003187, + "grad_norm": 0.47247472405433655, + "learning_rate": 8.09253822010732e-05, + "loss": 1.1948, + "step": 3778 + }, + { + "epoch": 1.1470632872970101, + "grad_norm": 0.5024043321609497, + "learning_rate": 8.092031993520301e-05, + "loss": 1.749, + "step": 3779 + }, + { + "epoch": 1.1473668234937016, + "grad_norm": 0.45547524094581604, + "learning_rate": 8.09152576693328e-05, + "loss": 1.9116, + "step": 3780 + }, + { + "epoch": 1.147670359690393, + "grad_norm": 0.5225228071212769, + "learning_rate": 8.09101954034626e-05, + "loss": 1.5611, + "step": 3781 + }, + { + "epoch": 1.1479738958870844, + "grad_norm": 0.5099766850471497, + "learning_rate": 8.09051331375924e-05, + "loss": 1.6613, + "step": 3782 + }, + { + "epoch": 1.148277432083776, + "grad_norm": 0.5814756751060486, + "learning_rate": 8.090007087172219e-05, + "loss": 1.7684, + "step": 3783 + }, + { + "epoch": 1.1485809682804675, + "grad_norm": 0.5852969884872437, + "learning_rate": 8.089500860585198e-05, + "loss": 1.8952, + "step": 3784 + }, + { + "epoch": 1.148884504477159, + "grad_norm": 1.107011318206787, + "learning_rate": 8.088994633998178e-05, + "loss": 0.8599, + "step": 3785 + }, + { + "epoch": 1.1491880406738504, + "grad_norm": 0.570236086845398, + "learning_rate": 8.088488407411157e-05, + "loss": 1.7961, + "step": 3786 + }, + { + "epoch": 1.1494915768705418, + "grad_norm": 0.6179838180541992, + "learning_rate": 8.087982180824137e-05, + "loss": 1.3554, + "step": 3787 + }, + { + "epoch": 1.1497951130672333, + "grad_norm": 0.4929959177970886, + "learning_rate": 8.087475954237118e-05, + "loss": 1.7513, + "step": 3788 + }, + { + "epoch": 1.1500986492639247, + "grad_norm": 0.48527392745018005, + "learning_rate": 8.086969727650097e-05, + "loss": 1.7123, + "step": 3789 + }, + { + "epoch": 1.1504021854606161, + "grad_norm": 0.4290766417980194, + "learning_rate": 8.086463501063077e-05, + "loss": 1.8191, + "step": 3790 + }, + { + "epoch": 1.1507057216573076, + "grad_norm": 0.448541522026062, + "learning_rate": 8.085957274476056e-05, + "loss": 0.9604, + "step": 3791 + }, + { + "epoch": 1.1510092578539992, + "grad_norm": 0.41005972027778625, + "learning_rate": 8.085451047889036e-05, + "loss": 1.3889, + "step": 3792 + }, + { + "epoch": 1.1513127940506904, + "grad_norm": 0.5160661935806274, + "learning_rate": 8.084944821302015e-05, + "loss": 1.7646, + "step": 3793 + }, + { + "epoch": 1.151616330247382, + "grad_norm": 0.5291827321052551, + "learning_rate": 8.084438594714994e-05, + "loss": 1.7524, + "step": 3794 + }, + { + "epoch": 1.1519198664440735, + "grad_norm": 0.47271502017974854, + "learning_rate": 8.083932368127974e-05, + "loss": 1.8938, + "step": 3795 + }, + { + "epoch": 1.152223402640765, + "grad_norm": 0.42168867588043213, + "learning_rate": 8.083426141540953e-05, + "loss": 1.5912, + "step": 3796 + }, + { + "epoch": 1.1525269388374564, + "grad_norm": 0.687078058719635, + "learning_rate": 8.082919914953934e-05, + "loss": 1.7467, + "step": 3797 + }, + { + "epoch": 1.1528304750341478, + "grad_norm": 0.49269938468933105, + "learning_rate": 8.082413688366914e-05, + "loss": 1.4631, + "step": 3798 + }, + { + "epoch": 1.1531340112308393, + "grad_norm": 0.49438297748565674, + "learning_rate": 8.081907461779893e-05, + "loss": 1.5982, + "step": 3799 + }, + { + "epoch": 1.1534375474275307, + "grad_norm": 0.5099816918373108, + "learning_rate": 8.081401235192873e-05, + "loss": 1.6595, + "step": 3800 + }, + { + "epoch": 1.1537410836242221, + "grad_norm": 0.5655848979949951, + "learning_rate": 8.080895008605852e-05, + "loss": 1.3832, + "step": 3801 + }, + { + "epoch": 1.1540446198209136, + "grad_norm": 0.5156016945838928, + "learning_rate": 8.080388782018832e-05, + "loss": 1.3481, + "step": 3802 + }, + { + "epoch": 1.1543481560176052, + "grad_norm": 0.3963063657283783, + "learning_rate": 8.079882555431811e-05, + "loss": 1.1331, + "step": 3803 + }, + { + "epoch": 1.1546516922142966, + "grad_norm": 0.5562106370925903, + "learning_rate": 8.07937632884479e-05, + "loss": 1.7705, + "step": 3804 + }, + { + "epoch": 1.154955228410988, + "grad_norm": 0.5723055601119995, + "learning_rate": 8.07887010225777e-05, + "loss": 1.6301, + "step": 3805 + }, + { + "epoch": 1.1552587646076795, + "grad_norm": 0.598197877407074, + "learning_rate": 8.07836387567075e-05, + "loss": 1.1021, + "step": 3806 + }, + { + "epoch": 1.155562300804371, + "grad_norm": 0.47882279753685, + "learning_rate": 8.07785764908373e-05, + "loss": 1.7258, + "step": 3807 + }, + { + "epoch": 1.1558658370010624, + "grad_norm": 0.777834951877594, + "learning_rate": 8.07735142249671e-05, + "loss": 2.1178, + "step": 3808 + }, + { + "epoch": 1.1561693731977538, + "grad_norm": 0.5457454919815063, + "learning_rate": 8.07684519590969e-05, + "loss": 1.6053, + "step": 3809 + }, + { + "epoch": 1.1564729093944452, + "grad_norm": 0.6933251023292542, + "learning_rate": 8.076338969322669e-05, + "loss": 1.2324, + "step": 3810 + }, + { + "epoch": 1.1567764455911367, + "grad_norm": 0.7639228701591492, + "learning_rate": 8.075832742735648e-05, + "loss": 1.5564, + "step": 3811 + }, + { + "epoch": 1.1570799817878281, + "grad_norm": 0.5342271327972412, + "learning_rate": 8.075326516148628e-05, + "loss": 1.6796, + "step": 3812 + }, + { + "epoch": 1.1573835179845195, + "grad_norm": 0.49576303362846375, + "learning_rate": 8.074820289561607e-05, + "loss": 1.1671, + "step": 3813 + }, + { + "epoch": 1.1576870541812112, + "grad_norm": 0.45418110489845276, + "learning_rate": 8.074314062974588e-05, + "loss": 1.821, + "step": 3814 + }, + { + "epoch": 1.1579905903779026, + "grad_norm": 0.49441277980804443, + "learning_rate": 8.073807836387568e-05, + "loss": 1.6599, + "step": 3815 + }, + { + "epoch": 1.158294126574594, + "grad_norm": 0.5793106555938721, + "learning_rate": 8.073301609800547e-05, + "loss": 1.7381, + "step": 3816 + }, + { + "epoch": 1.1585976627712855, + "grad_norm": 0.6442940831184387, + "learning_rate": 8.072795383213527e-05, + "loss": 2.0495, + "step": 3817 + }, + { + "epoch": 1.158901198967977, + "grad_norm": 0.5308768153190613, + "learning_rate": 8.072289156626507e-05, + "loss": 1.6042, + "step": 3818 + }, + { + "epoch": 1.1592047351646684, + "grad_norm": 0.4919055104255676, + "learning_rate": 8.071782930039487e-05, + "loss": 1.4157, + "step": 3819 + }, + { + "epoch": 1.1595082713613598, + "grad_norm": 0.506922721862793, + "learning_rate": 8.071276703452466e-05, + "loss": 1.5703, + "step": 3820 + }, + { + "epoch": 1.1598118075580512, + "grad_norm": 0.5807430148124695, + "learning_rate": 8.070770476865446e-05, + "loss": 1.4976, + "step": 3821 + }, + { + "epoch": 1.1601153437547427, + "grad_norm": 0.489700049161911, + "learning_rate": 8.070264250278425e-05, + "loss": 1.0653, + "step": 3822 + }, + { + "epoch": 1.1604188799514341, + "grad_norm": 0.4885237514972687, + "learning_rate": 8.069758023691405e-05, + "loss": 1.4299, + "step": 3823 + }, + { + "epoch": 1.1607224161481255, + "grad_norm": 0.6799513697624207, + "learning_rate": 8.069251797104384e-05, + "loss": 1.2705, + "step": 3824 + }, + { + "epoch": 1.1610259523448172, + "grad_norm": 0.42571133375167847, + "learning_rate": 8.068745570517364e-05, + "loss": 1.2678, + "step": 3825 + }, + { + "epoch": 1.1613294885415086, + "grad_norm": 0.4387352466583252, + "learning_rate": 8.068239343930343e-05, + "loss": 1.7297, + "step": 3826 + }, + { + "epoch": 1.1616330247382, + "grad_norm": 0.4486967921257019, + "learning_rate": 8.067733117343324e-05, + "loss": 1.424, + "step": 3827 + }, + { + "epoch": 1.1619365609348915, + "grad_norm": 0.5393472909927368, + "learning_rate": 8.067226890756304e-05, + "loss": 1.4029, + "step": 3828 + }, + { + "epoch": 1.162240097131583, + "grad_norm": 0.5873778462409973, + "learning_rate": 8.066720664169283e-05, + "loss": 1.713, + "step": 3829 + }, + { + "epoch": 1.1625436333282744, + "grad_norm": 0.5121496915817261, + "learning_rate": 8.066214437582263e-05, + "loss": 1.6434, + "step": 3830 + }, + { + "epoch": 1.1628471695249658, + "grad_norm": 0.5076146125793457, + "learning_rate": 8.065708210995242e-05, + "loss": 1.4398, + "step": 3831 + }, + { + "epoch": 1.1631507057216572, + "grad_norm": 0.5533864498138428, + "learning_rate": 8.065201984408221e-05, + "loss": 1.4061, + "step": 3832 + }, + { + "epoch": 1.1634542419183487, + "grad_norm": 0.5335582494735718, + "learning_rate": 8.064695757821201e-05, + "loss": 1.8173, + "step": 3833 + }, + { + "epoch": 1.1637577781150403, + "grad_norm": 0.45890846848487854, + "learning_rate": 8.06418953123418e-05, + "loss": 1.7913, + "step": 3834 + }, + { + "epoch": 1.1640613143117318, + "grad_norm": 0.5167157053947449, + "learning_rate": 8.06368330464716e-05, + "loss": 1.7257, + "step": 3835 + }, + { + "epoch": 1.1643648505084232, + "grad_norm": 0.5924651622772217, + "learning_rate": 8.063177078060141e-05, + "loss": 1.7063, + "step": 3836 + }, + { + "epoch": 1.1646683867051146, + "grad_norm": 0.48032063245773315, + "learning_rate": 8.06267085147312e-05, + "loss": 1.6812, + "step": 3837 + }, + { + "epoch": 1.164971922901806, + "grad_norm": 0.4973551034927368, + "learning_rate": 8.0621646248861e-05, + "loss": 1.5737, + "step": 3838 + }, + { + "epoch": 1.1652754590984975, + "grad_norm": 0.5250328183174133, + "learning_rate": 8.061658398299079e-05, + "loss": 1.2556, + "step": 3839 + }, + { + "epoch": 1.165578995295189, + "grad_norm": 0.4585050642490387, + "learning_rate": 8.061152171712059e-05, + "loss": 1.0296, + "step": 3840 + }, + { + "epoch": 1.1658825314918804, + "grad_norm": 0.5470108389854431, + "learning_rate": 8.060645945125038e-05, + "loss": 1.9698, + "step": 3841 + }, + { + "epoch": 1.1661860676885718, + "grad_norm": 0.512869119644165, + "learning_rate": 8.060139718538018e-05, + "loss": 1.6972, + "step": 3842 + }, + { + "epoch": 1.1664896038852632, + "grad_norm": 0.5125742554664612, + "learning_rate": 8.059633491950997e-05, + "loss": 1.8175, + "step": 3843 + }, + { + "epoch": 1.1667931400819547, + "grad_norm": 0.4400027394294739, + "learning_rate": 8.059127265363977e-05, + "loss": 1.559, + "step": 3844 + }, + { + "epoch": 1.1670966762786463, + "grad_norm": 0.695661187171936, + "learning_rate": 8.058621038776956e-05, + "loss": 1.7339, + "step": 3845 + }, + { + "epoch": 1.1674002124753378, + "grad_norm": 0.7078722715377808, + "learning_rate": 8.058114812189937e-05, + "loss": 1.8629, + "step": 3846 + }, + { + "epoch": 1.1677037486720292, + "grad_norm": 0.6370052099227905, + "learning_rate": 8.057608585602916e-05, + "loss": 1.3688, + "step": 3847 + }, + { + "epoch": 1.1680072848687206, + "grad_norm": 0.5090733766555786, + "learning_rate": 8.057102359015896e-05, + "loss": 1.5642, + "step": 3848 + }, + { + "epoch": 1.168310821065412, + "grad_norm": 0.6681460738182068, + "learning_rate": 8.056596132428875e-05, + "loss": 1.6192, + "step": 3849 + }, + { + "epoch": 1.1686143572621035, + "grad_norm": 0.507787823677063, + "learning_rate": 8.056089905841855e-05, + "loss": 1.789, + "step": 3850 + }, + { + "epoch": 1.168917893458795, + "grad_norm": 0.5228447318077087, + "learning_rate": 8.055583679254834e-05, + "loss": 1.4613, + "step": 3851 + }, + { + "epoch": 1.1692214296554864, + "grad_norm": 0.43099793791770935, + "learning_rate": 8.055077452667814e-05, + "loss": 1.4872, + "step": 3852 + }, + { + "epoch": 1.1695249658521778, + "grad_norm": 0.5265125036239624, + "learning_rate": 8.054571226080793e-05, + "loss": 1.4256, + "step": 3853 + }, + { + "epoch": 1.1698285020488692, + "grad_norm": 0.4951777458190918, + "learning_rate": 8.054064999493773e-05, + "loss": 1.5817, + "step": 3854 + }, + { + "epoch": 1.1701320382455607, + "grad_norm": 0.6004377007484436, + "learning_rate": 8.053558772906754e-05, + "loss": 1.6111, + "step": 3855 + }, + { + "epoch": 1.1704355744422523, + "grad_norm": 0.43849846720695496, + "learning_rate": 8.053052546319733e-05, + "loss": 1.8428, + "step": 3856 + }, + { + "epoch": 1.1707391106389438, + "grad_norm": 0.5169385075569153, + "learning_rate": 8.052546319732713e-05, + "loss": 1.9268, + "step": 3857 + }, + { + "epoch": 1.1710426468356352, + "grad_norm": 0.5246443152427673, + "learning_rate": 8.052040093145692e-05, + "loss": 1.5416, + "step": 3858 + }, + { + "epoch": 1.1713461830323266, + "grad_norm": 0.6965652704238892, + "learning_rate": 8.051533866558673e-05, + "loss": 1.9555, + "step": 3859 + }, + { + "epoch": 1.171649719229018, + "grad_norm": 0.5366235971450806, + "learning_rate": 8.051027639971652e-05, + "loss": 1.5321, + "step": 3860 + }, + { + "epoch": 1.1719532554257095, + "grad_norm": 0.4586010277271271, + "learning_rate": 8.050521413384632e-05, + "loss": 1.7923, + "step": 3861 + }, + { + "epoch": 1.172256791622401, + "grad_norm": 0.467289000749588, + "learning_rate": 8.050015186797611e-05, + "loss": 1.5372, + "step": 3862 + }, + { + "epoch": 1.1725603278190924, + "grad_norm": 0.47332337498664856, + "learning_rate": 8.049508960210591e-05, + "loss": 1.8386, + "step": 3863 + }, + { + "epoch": 1.1728638640157838, + "grad_norm": 0.4423415958881378, + "learning_rate": 8.04900273362357e-05, + "loss": 1.6493, + "step": 3864 + }, + { + "epoch": 1.1731674002124755, + "grad_norm": 0.5270376801490784, + "learning_rate": 8.04849650703655e-05, + "loss": 1.6646, + "step": 3865 + }, + { + "epoch": 1.1734709364091669, + "grad_norm": 0.45497021079063416, + "learning_rate": 8.04799028044953e-05, + "loss": 1.8518, + "step": 3866 + }, + { + "epoch": 1.1737744726058583, + "grad_norm": 0.4662345349788666, + "learning_rate": 8.04748405386251e-05, + "loss": 1.1371, + "step": 3867 + }, + { + "epoch": 1.1740780088025498, + "grad_norm": 0.6664243340492249, + "learning_rate": 8.04697782727549e-05, + "loss": 1.4377, + "step": 3868 + }, + { + "epoch": 1.1743815449992412, + "grad_norm": 0.5250559449195862, + "learning_rate": 8.046471600688469e-05, + "loss": 1.5832, + "step": 3869 + }, + { + "epoch": 1.1746850811959326, + "grad_norm": 0.6499760746955872, + "learning_rate": 8.045965374101448e-05, + "loss": 2.0563, + "step": 3870 + }, + { + "epoch": 1.174988617392624, + "grad_norm": 0.5033279061317444, + "learning_rate": 8.045459147514428e-05, + "loss": 1.6149, + "step": 3871 + }, + { + "epoch": 1.1752921535893155, + "grad_norm": 0.5690430402755737, + "learning_rate": 8.044952920927407e-05, + "loss": 1.6062, + "step": 3872 + }, + { + "epoch": 1.175595689786007, + "grad_norm": 0.5902601480484009, + "learning_rate": 8.044446694340387e-05, + "loss": 1.6706, + "step": 3873 + }, + { + "epoch": 1.1758992259826984, + "grad_norm": 0.4814835786819458, + "learning_rate": 8.043940467753366e-05, + "loss": 1.7218, + "step": 3874 + }, + { + "epoch": 1.1762027621793898, + "grad_norm": 0.49932724237442017, + "learning_rate": 8.043434241166347e-05, + "loss": 1.7839, + "step": 3875 + }, + { + "epoch": 1.1765062983760814, + "grad_norm": 0.48431655764579773, + "learning_rate": 8.042928014579327e-05, + "loss": 1.5341, + "step": 3876 + }, + { + "epoch": 1.1768098345727729, + "grad_norm": 0.5197260975837708, + "learning_rate": 8.042421787992306e-05, + "loss": 1.0566, + "step": 3877 + }, + { + "epoch": 1.1771133707694643, + "grad_norm": 0.578517735004425, + "learning_rate": 8.041915561405286e-05, + "loss": 1.4678, + "step": 3878 + }, + { + "epoch": 1.1774169069661558, + "grad_norm": 1.7475563287734985, + "learning_rate": 8.041409334818265e-05, + "loss": 1.3523, + "step": 3879 + }, + { + "epoch": 1.1777204431628472, + "grad_norm": 0.4969330132007599, + "learning_rate": 8.040903108231245e-05, + "loss": 1.7174, + "step": 3880 + }, + { + "epoch": 1.1780239793595386, + "grad_norm": 0.5173845291137695, + "learning_rate": 8.040396881644224e-05, + "loss": 1.7304, + "step": 3881 + }, + { + "epoch": 1.17832751555623, + "grad_norm": 0.5299842357635498, + "learning_rate": 8.039890655057204e-05, + "loss": 1.903, + "step": 3882 + }, + { + "epoch": 1.1786310517529215, + "grad_norm": 0.4468022286891937, + "learning_rate": 8.039384428470183e-05, + "loss": 1.8697, + "step": 3883 + }, + { + "epoch": 1.178934587949613, + "grad_norm": 0.5375956892967224, + "learning_rate": 8.038878201883163e-05, + "loss": 1.4277, + "step": 3884 + }, + { + "epoch": 1.1792381241463044, + "grad_norm": 0.5312685370445251, + "learning_rate": 8.038371975296143e-05, + "loss": 1.5446, + "step": 3885 + }, + { + "epoch": 1.1795416603429958, + "grad_norm": 0.4160996377468109, + "learning_rate": 8.037865748709123e-05, + "loss": 1.439, + "step": 3886 + }, + { + "epoch": 1.1798451965396874, + "grad_norm": 0.5316253900527954, + "learning_rate": 8.037359522122102e-05, + "loss": 1.6263, + "step": 3887 + }, + { + "epoch": 1.1801487327363789, + "grad_norm": 0.5926960706710815, + "learning_rate": 8.036853295535082e-05, + "loss": 1.2918, + "step": 3888 + }, + { + "epoch": 1.1804522689330703, + "grad_norm": 0.557610034942627, + "learning_rate": 8.036347068948061e-05, + "loss": 1.7301, + "step": 3889 + }, + { + "epoch": 1.1807558051297617, + "grad_norm": 0.5832757949829102, + "learning_rate": 8.035840842361041e-05, + "loss": 1.7146, + "step": 3890 + }, + { + "epoch": 1.1810593413264532, + "grad_norm": 0.6123656034469604, + "learning_rate": 8.03533461577402e-05, + "loss": 1.7001, + "step": 3891 + }, + { + "epoch": 1.1813628775231446, + "grad_norm": 0.912534773349762, + "learning_rate": 8.034828389187e-05, + "loss": 1.6857, + "step": 3892 + }, + { + "epoch": 1.181666413719836, + "grad_norm": 0.8999262452125549, + "learning_rate": 8.034322162599979e-05, + "loss": 1.6986, + "step": 3893 + }, + { + "epoch": 1.1819699499165275, + "grad_norm": 0.5878473520278931, + "learning_rate": 8.03381593601296e-05, + "loss": 1.596, + "step": 3894 + }, + { + "epoch": 1.182273486113219, + "grad_norm": 0.48312613368034363, + "learning_rate": 8.03330970942594e-05, + "loss": 1.6136, + "step": 3895 + }, + { + "epoch": 1.1825770223099106, + "grad_norm": 0.4754876494407654, + "learning_rate": 8.032803482838919e-05, + "loss": 1.5353, + "step": 3896 + }, + { + "epoch": 1.182880558506602, + "grad_norm": 0.5699150562286377, + "learning_rate": 8.032297256251898e-05, + "loss": 1.4489, + "step": 3897 + }, + { + "epoch": 1.1831840947032934, + "grad_norm": 0.407832533121109, + "learning_rate": 8.031791029664878e-05, + "loss": 1.7367, + "step": 3898 + }, + { + "epoch": 1.1834876308999849, + "grad_norm": 0.5520400404930115, + "learning_rate": 8.031284803077857e-05, + "loss": 1.9028, + "step": 3899 + }, + { + "epoch": 1.1837911670966763, + "grad_norm": 1.5105438232421875, + "learning_rate": 8.030778576490837e-05, + "loss": 1.6191, + "step": 3900 + }, + { + "epoch": 1.1840947032933677, + "grad_norm": 0.5375813245773315, + "learning_rate": 8.030272349903816e-05, + "loss": 1.7874, + "step": 3901 + }, + { + "epoch": 1.1843982394900592, + "grad_norm": 0.6227301955223083, + "learning_rate": 8.029766123316796e-05, + "loss": 1.6987, + "step": 3902 + }, + { + "epoch": 1.1847017756867506, + "grad_norm": 0.5316741466522217, + "learning_rate": 8.029259896729777e-05, + "loss": 1.4782, + "step": 3903 + }, + { + "epoch": 1.185005311883442, + "grad_norm": 0.5515217185020447, + "learning_rate": 8.028753670142756e-05, + "loss": 1.7657, + "step": 3904 + }, + { + "epoch": 1.1853088480801335, + "grad_norm": 0.5551007390022278, + "learning_rate": 8.028247443555737e-05, + "loss": 1.7397, + "step": 3905 + }, + { + "epoch": 1.185612384276825, + "grad_norm": 0.5252280831336975, + "learning_rate": 8.027741216968717e-05, + "loss": 1.5638, + "step": 3906 + }, + { + "epoch": 1.1859159204735166, + "grad_norm": 0.5458955764770508, + "learning_rate": 8.027234990381696e-05, + "loss": 1.1987, + "step": 3907 + }, + { + "epoch": 1.186219456670208, + "grad_norm": 0.4248373210430145, + "learning_rate": 8.026728763794675e-05, + "loss": 1.3968, + "step": 3908 + }, + { + "epoch": 1.1865229928668994, + "grad_norm": 0.5304285287857056, + "learning_rate": 8.026222537207655e-05, + "loss": 1.8042, + "step": 3909 + }, + { + "epoch": 1.1868265290635909, + "grad_norm": 0.5563675165176392, + "learning_rate": 8.025716310620634e-05, + "loss": 1.8037, + "step": 3910 + }, + { + "epoch": 1.1871300652602823, + "grad_norm": 0.5306220650672913, + "learning_rate": 8.025210084033614e-05, + "loss": 1.0585, + "step": 3911 + }, + { + "epoch": 1.1874336014569737, + "grad_norm": 0.8807549476623535, + "learning_rate": 8.024703857446593e-05, + "loss": 1.3612, + "step": 3912 + }, + { + "epoch": 1.1877371376536652, + "grad_norm": 0.8862515687942505, + "learning_rate": 8.024197630859573e-05, + "loss": 1.4855, + "step": 3913 + }, + { + "epoch": 1.1880406738503566, + "grad_norm": 0.4896228611469269, + "learning_rate": 8.023691404272554e-05, + "loss": 1.7299, + "step": 3914 + }, + { + "epoch": 1.188344210047048, + "grad_norm": 0.4608948230743408, + "learning_rate": 8.023185177685533e-05, + "loss": 1.7385, + "step": 3915 + }, + { + "epoch": 1.1886477462437395, + "grad_norm": 0.48187950253486633, + "learning_rate": 8.022678951098513e-05, + "loss": 1.8916, + "step": 3916 + }, + { + "epoch": 1.188951282440431, + "grad_norm": 0.679067075252533, + "learning_rate": 8.022172724511492e-05, + "loss": 1.6252, + "step": 3917 + }, + { + "epoch": 1.1892548186371226, + "grad_norm": 0.48127803206443787, + "learning_rate": 8.021666497924472e-05, + "loss": 1.5121, + "step": 3918 + }, + { + "epoch": 1.189558354833814, + "grad_norm": 0.5188263654708862, + "learning_rate": 8.021160271337451e-05, + "loss": 1.8135, + "step": 3919 + }, + { + "epoch": 1.1898618910305054, + "grad_norm": 0.49956321716308594, + "learning_rate": 8.02065404475043e-05, + "loss": 1.4235, + "step": 3920 + }, + { + "epoch": 1.1901654272271969, + "grad_norm": 0.5067212581634521, + "learning_rate": 8.02014781816341e-05, + "loss": 1.4532, + "step": 3921 + }, + { + "epoch": 1.1904689634238883, + "grad_norm": 0.4681207239627838, + "learning_rate": 8.01964159157639e-05, + "loss": 1.6127, + "step": 3922 + }, + { + "epoch": 1.1907724996205797, + "grad_norm": 3.2469677925109863, + "learning_rate": 8.019135364989369e-05, + "loss": 1.2453, + "step": 3923 + }, + { + "epoch": 1.1910760358172712, + "grad_norm": 0.4977457821369171, + "learning_rate": 8.01862913840235e-05, + "loss": 1.6088, + "step": 3924 + }, + { + "epoch": 1.1913795720139626, + "grad_norm": 0.4594128727912903, + "learning_rate": 8.01812291181533e-05, + "loss": 1.6691, + "step": 3925 + }, + { + "epoch": 1.191683108210654, + "grad_norm": 0.49090635776519775, + "learning_rate": 8.017616685228309e-05, + "loss": 1.6259, + "step": 3926 + }, + { + "epoch": 1.1919866444073457, + "grad_norm": 0.5628901124000549, + "learning_rate": 8.017110458641288e-05, + "loss": 1.4469, + "step": 3927 + }, + { + "epoch": 1.192290180604037, + "grad_norm": 0.5099067091941833, + "learning_rate": 8.016604232054268e-05, + "loss": 1.7828, + "step": 3928 + }, + { + "epoch": 1.1925937168007286, + "grad_norm": 0.546001672744751, + "learning_rate": 8.016098005467247e-05, + "loss": 1.9593, + "step": 3929 + }, + { + "epoch": 1.19289725299742, + "grad_norm": 0.5143636465072632, + "learning_rate": 8.015591778880227e-05, + "loss": 1.6765, + "step": 3930 + }, + { + "epoch": 1.1932007891941114, + "grad_norm": 0.5303293466567993, + "learning_rate": 8.015085552293206e-05, + "loss": 1.6472, + "step": 3931 + }, + { + "epoch": 1.1935043253908029, + "grad_norm": 0.5036451816558838, + "learning_rate": 8.014579325706186e-05, + "loss": 1.6347, + "step": 3932 + }, + { + "epoch": 1.1938078615874943, + "grad_norm": 0.5039635896682739, + "learning_rate": 8.014073099119167e-05, + "loss": 1.6958, + "step": 3933 + }, + { + "epoch": 1.1941113977841857, + "grad_norm": 1.2352403402328491, + "learning_rate": 8.013566872532146e-05, + "loss": 1.2393, + "step": 3934 + }, + { + "epoch": 1.1944149339808772, + "grad_norm": 0.4501217007637024, + "learning_rate": 8.013060645945125e-05, + "loss": 1.7043, + "step": 3935 + }, + { + "epoch": 1.1947184701775686, + "grad_norm": 0.47026047110557556, + "learning_rate": 8.012554419358105e-05, + "loss": 1.8155, + "step": 3936 + }, + { + "epoch": 1.19502200637426, + "grad_norm": 1.4304115772247314, + "learning_rate": 8.012048192771084e-05, + "loss": 1.4581, + "step": 3937 + }, + { + "epoch": 1.1953255425709517, + "grad_norm": 0.5405710339546204, + "learning_rate": 8.011541966184064e-05, + "loss": 1.1641, + "step": 3938 + }, + { + "epoch": 1.1956290787676431, + "grad_norm": 0.4712076187133789, + "learning_rate": 8.011035739597043e-05, + "loss": 1.7641, + "step": 3939 + }, + { + "epoch": 1.1959326149643346, + "grad_norm": 0.5050527453422546, + "learning_rate": 8.010529513010023e-05, + "loss": 1.6604, + "step": 3940 + }, + { + "epoch": 1.196236151161026, + "grad_norm": 0.5808800458908081, + "learning_rate": 8.010023286423002e-05, + "loss": 1.6771, + "step": 3941 + }, + { + "epoch": 1.1965396873577174, + "grad_norm": 0.5032142996788025, + "learning_rate": 8.009517059835983e-05, + "loss": 1.7387, + "step": 3942 + }, + { + "epoch": 1.1968432235544089, + "grad_norm": 0.5121293067932129, + "learning_rate": 8.009010833248963e-05, + "loss": 1.7244, + "step": 3943 + }, + { + "epoch": 1.1971467597511003, + "grad_norm": 0.5486367344856262, + "learning_rate": 8.008504606661942e-05, + "loss": 1.2352, + "step": 3944 + }, + { + "epoch": 1.1974502959477917, + "grad_norm": 0.4894542396068573, + "learning_rate": 8.007998380074922e-05, + "loss": 1.6186, + "step": 3945 + }, + { + "epoch": 1.1977538321444832, + "grad_norm": 0.5042988657951355, + "learning_rate": 8.007492153487901e-05, + "loss": 1.946, + "step": 3946 + }, + { + "epoch": 1.1980573683411746, + "grad_norm": 0.495233416557312, + "learning_rate": 8.00698592690088e-05, + "loss": 1.8769, + "step": 3947 + }, + { + "epoch": 1.198360904537866, + "grad_norm": 1.2553257942199707, + "learning_rate": 8.006479700313861e-05, + "loss": 1.2075, + "step": 3948 + }, + { + "epoch": 1.1986644407345577, + "grad_norm": 0.5711445808410645, + "learning_rate": 8.005973473726841e-05, + "loss": 1.5606, + "step": 3949 + }, + { + "epoch": 1.1989679769312491, + "grad_norm": 0.40884625911712646, + "learning_rate": 8.00546724713982e-05, + "loss": 1.6262, + "step": 3950 + }, + { + "epoch": 1.1992715131279406, + "grad_norm": 0.46294069290161133, + "learning_rate": 8.0049610205528e-05, + "loss": 1.9437, + "step": 3951 + }, + { + "epoch": 1.199575049324632, + "grad_norm": 0.43358656764030457, + "learning_rate": 8.00445479396578e-05, + "loss": 1.708, + "step": 3952 + }, + { + "epoch": 1.1998785855213234, + "grad_norm": 0.48382043838500977, + "learning_rate": 8.00394856737876e-05, + "loss": 1.3979, + "step": 3953 + }, + { + "epoch": 1.2001821217180149, + "grad_norm": 0.5665555596351624, + "learning_rate": 8.00344234079174e-05, + "loss": 1.6572, + "step": 3954 + }, + { + "epoch": 1.2004856579147063, + "grad_norm": 0.49451395869255066, + "learning_rate": 8.002936114204719e-05, + "loss": 1.781, + "step": 3955 + }, + { + "epoch": 1.2007891941113977, + "grad_norm": 0.47142118215560913, + "learning_rate": 8.002429887617699e-05, + "loss": 1.9088, + "step": 3956 + }, + { + "epoch": 1.2010927303080892, + "grad_norm": 0.470790833234787, + "learning_rate": 8.001923661030678e-05, + "loss": 1.0126, + "step": 3957 + }, + { + "epoch": 1.2013962665047808, + "grad_norm": 0.5409683585166931, + "learning_rate": 8.001417434443658e-05, + "loss": 1.6517, + "step": 3958 + }, + { + "epoch": 1.201699802701472, + "grad_norm": 0.5949118137359619, + "learning_rate": 8.000911207856637e-05, + "loss": 1.4266, + "step": 3959 + }, + { + "epoch": 1.2020033388981637, + "grad_norm": 0.48740145564079285, + "learning_rate": 8.000404981269617e-05, + "loss": 1.7951, + "step": 3960 + }, + { + "epoch": 1.2023068750948551, + "grad_norm": 0.5187662839889526, + "learning_rate": 7.999898754682596e-05, + "loss": 1.5534, + "step": 3961 + }, + { + "epoch": 1.2026104112915466, + "grad_norm": 0.48459750413894653, + "learning_rate": 7.999392528095575e-05, + "loss": 1.3518, + "step": 3962 + }, + { + "epoch": 1.202913947488238, + "grad_norm": 0.9907912015914917, + "learning_rate": 7.998886301508556e-05, + "loss": 1.3871, + "step": 3963 + }, + { + "epoch": 1.2032174836849294, + "grad_norm": 0.460400253534317, + "learning_rate": 7.998380074921536e-05, + "loss": 1.8961, + "step": 3964 + }, + { + "epoch": 1.2035210198816209, + "grad_norm": 0.480882853269577, + "learning_rate": 7.997873848334515e-05, + "loss": 1.5589, + "step": 3965 + }, + { + "epoch": 1.2038245560783123, + "grad_norm": 0.5009549260139465, + "learning_rate": 7.997367621747495e-05, + "loss": 1.6373, + "step": 3966 + }, + { + "epoch": 1.2041280922750037, + "grad_norm": 0.5043389797210693, + "learning_rate": 7.996861395160474e-05, + "loss": 1.4056, + "step": 3967 + }, + { + "epoch": 1.2044316284716952, + "grad_norm": 0.5639089941978455, + "learning_rate": 7.996355168573454e-05, + "loss": 1.7575, + "step": 3968 + }, + { + "epoch": 1.2047351646683868, + "grad_norm": 0.43601855635643005, + "learning_rate": 7.995848941986433e-05, + "loss": 0.9839, + "step": 3969 + }, + { + "epoch": 1.2050387008650782, + "grad_norm": 0.4941728711128235, + "learning_rate": 7.995342715399413e-05, + "loss": 1.7745, + "step": 3970 + }, + { + "epoch": 1.2053422370617697, + "grad_norm": 0.5130265355110168, + "learning_rate": 7.994836488812392e-05, + "loss": 1.6205, + "step": 3971 + }, + { + "epoch": 1.2056457732584611, + "grad_norm": 0.4736790060997009, + "learning_rate": 7.994330262225373e-05, + "loss": 1.728, + "step": 3972 + }, + { + "epoch": 1.2059493094551526, + "grad_norm": 0.4924636781215668, + "learning_rate": 7.993824035638352e-05, + "loss": 1.6651, + "step": 3973 + }, + { + "epoch": 1.206252845651844, + "grad_norm": 0.4965018332004547, + "learning_rate": 7.993317809051332e-05, + "loss": 1.7473, + "step": 3974 + }, + { + "epoch": 1.2065563818485354, + "grad_norm": 0.5033332705497742, + "learning_rate": 7.992811582464311e-05, + "loss": 1.8963, + "step": 3975 + }, + { + "epoch": 1.2068599180452269, + "grad_norm": 0.5527442097663879, + "learning_rate": 7.992305355877291e-05, + "loss": 1.4068, + "step": 3976 + }, + { + "epoch": 1.2071634542419183, + "grad_norm": 0.6062281131744385, + "learning_rate": 7.99179912929027e-05, + "loss": 1.8115, + "step": 3977 + }, + { + "epoch": 1.2074669904386097, + "grad_norm": 0.6154857873916626, + "learning_rate": 7.99129290270325e-05, + "loss": 1.4765, + "step": 3978 + }, + { + "epoch": 1.2077705266353012, + "grad_norm": 0.49860256910324097, + "learning_rate": 7.99078667611623e-05, + "loss": 1.6141, + "step": 3979 + }, + { + "epoch": 1.2080740628319928, + "grad_norm": 0.5555553436279297, + "learning_rate": 7.990280449529209e-05, + "loss": 1.4785, + "step": 3980 + }, + { + "epoch": 1.2083775990286842, + "grad_norm": 1.548555612564087, + "learning_rate": 7.98977422294219e-05, + "loss": 1.6227, + "step": 3981 + }, + { + "epoch": 1.2086811352253757, + "grad_norm": 0.42255523800849915, + "learning_rate": 7.989267996355169e-05, + "loss": 1.128, + "step": 3982 + }, + { + "epoch": 1.2089846714220671, + "grad_norm": 0.4955081343650818, + "learning_rate": 7.988761769768149e-05, + "loss": 1.742, + "step": 3983 + }, + { + "epoch": 1.2092882076187585, + "grad_norm": 0.4081752598285675, + "learning_rate": 7.988255543181128e-05, + "loss": 1.3424, + "step": 3984 + }, + { + "epoch": 1.20959174381545, + "grad_norm": 0.6145724654197693, + "learning_rate": 7.987749316594108e-05, + "loss": 1.7208, + "step": 3985 + }, + { + "epoch": 1.2098952800121414, + "grad_norm": 0.5161190629005432, + "learning_rate": 7.987243090007087e-05, + "loss": 1.2054, + "step": 3986 + }, + { + "epoch": 1.2101988162088329, + "grad_norm": 0.4951912462711334, + "learning_rate": 7.986736863420067e-05, + "loss": 1.6643, + "step": 3987 + }, + { + "epoch": 1.2105023524055243, + "grad_norm": 0.4547387957572937, + "learning_rate": 7.986230636833046e-05, + "loss": 1.6728, + "step": 3988 + }, + { + "epoch": 1.2108058886022157, + "grad_norm": 0.5978291630744934, + "learning_rate": 7.985724410246025e-05, + "loss": 1.5965, + "step": 3989 + }, + { + "epoch": 1.2111094247989072, + "grad_norm": 0.48494064807891846, + "learning_rate": 7.985218183659005e-05, + "loss": 1.7527, + "step": 3990 + }, + { + "epoch": 1.2114129609955988, + "grad_norm": 0.5768218636512756, + "learning_rate": 7.984711957071986e-05, + "loss": 1.5072, + "step": 3991 + }, + { + "epoch": 1.2117164971922902, + "grad_norm": 0.4511902928352356, + "learning_rate": 7.984205730484967e-05, + "loss": 1.8114, + "step": 3992 + }, + { + "epoch": 1.2120200333889817, + "grad_norm": 0.47839030623435974, + "learning_rate": 7.983699503897946e-05, + "loss": 1.682, + "step": 3993 + }, + { + "epoch": 1.212323569585673, + "grad_norm": 0.5301209688186646, + "learning_rate": 7.983193277310926e-05, + "loss": 1.9935, + "step": 3994 + }, + { + "epoch": 1.2126271057823645, + "grad_norm": 0.4306343197822571, + "learning_rate": 7.982687050723905e-05, + "loss": 1.2576, + "step": 3995 + }, + { + "epoch": 1.212930641979056, + "grad_norm": 0.7651235461235046, + "learning_rate": 7.982180824136885e-05, + "loss": 1.7594, + "step": 3996 + }, + { + "epoch": 1.2132341781757474, + "grad_norm": 1.4882827997207642, + "learning_rate": 7.981674597549864e-05, + "loss": 1.565, + "step": 3997 + }, + { + "epoch": 1.2135377143724388, + "grad_norm": 0.6777023673057556, + "learning_rate": 7.981168370962844e-05, + "loss": 1.3189, + "step": 3998 + }, + { + "epoch": 1.2138412505691303, + "grad_norm": 0.8454605937004089, + "learning_rate": 7.980662144375823e-05, + "loss": 1.1823, + "step": 3999 + }, + { + "epoch": 1.214144786765822, + "grad_norm": 0.5341261625289917, + "learning_rate": 7.980155917788802e-05, + "loss": 1.6943, + "step": 4000 + }, + { + "epoch": 1.2144483229625134, + "grad_norm": 0.44602447748184204, + "learning_rate": 7.979649691201782e-05, + "loss": 1.658, + "step": 4001 + }, + { + "epoch": 1.2147518591592048, + "grad_norm": 0.4305332899093628, + "learning_rate": 7.979143464614763e-05, + "loss": 1.9482, + "step": 4002 + }, + { + "epoch": 1.2150553953558962, + "grad_norm": 0.8739070892333984, + "learning_rate": 7.978637238027742e-05, + "loss": 0.7679, + "step": 4003 + }, + { + "epoch": 1.2153589315525877, + "grad_norm": 1.0856356620788574, + "learning_rate": 7.978131011440722e-05, + "loss": 1.9586, + "step": 4004 + }, + { + "epoch": 1.215662467749279, + "grad_norm": 0.5733173489570618, + "learning_rate": 7.977624784853701e-05, + "loss": 1.8098, + "step": 4005 + }, + { + "epoch": 1.2159660039459705, + "grad_norm": 0.5946094989776611, + "learning_rate": 7.977118558266681e-05, + "loss": 1.7604, + "step": 4006 + }, + { + "epoch": 1.216269540142662, + "grad_norm": 0.4725922644138336, + "learning_rate": 7.97661233167966e-05, + "loss": 2.0565, + "step": 4007 + }, + { + "epoch": 1.2165730763393534, + "grad_norm": 0.49736130237579346, + "learning_rate": 7.97610610509264e-05, + "loss": 1.8367, + "step": 4008 + }, + { + "epoch": 1.2168766125360448, + "grad_norm": 0.4991004467010498, + "learning_rate": 7.975599878505619e-05, + "loss": 1.6324, + "step": 4009 + }, + { + "epoch": 1.2171801487327363, + "grad_norm": 0.4400973618030548, + "learning_rate": 7.975093651918599e-05, + "loss": 1.2879, + "step": 4010 + }, + { + "epoch": 1.217483684929428, + "grad_norm": 0.5288736820220947, + "learning_rate": 7.97458742533158e-05, + "loss": 1.7851, + "step": 4011 + }, + { + "epoch": 1.2177872211261194, + "grad_norm": 0.5546556115150452, + "learning_rate": 7.974081198744559e-05, + "loss": 1.5529, + "step": 4012 + }, + { + "epoch": 1.2180907573228108, + "grad_norm": 0.6062618494033813, + "learning_rate": 7.973574972157538e-05, + "loss": 1.8191, + "step": 4013 + }, + { + "epoch": 1.2183942935195022, + "grad_norm": 0.5226564407348633, + "learning_rate": 7.973068745570518e-05, + "loss": 1.8121, + "step": 4014 + }, + { + "epoch": 1.2186978297161937, + "grad_norm": 0.5904191732406616, + "learning_rate": 7.972562518983497e-05, + "loss": 1.7177, + "step": 4015 + }, + { + "epoch": 1.219001365912885, + "grad_norm": 0.7053862810134888, + "learning_rate": 7.972056292396477e-05, + "loss": 1.2042, + "step": 4016 + }, + { + "epoch": 1.2193049021095765, + "grad_norm": 0.4357145130634308, + "learning_rate": 7.971550065809456e-05, + "loss": 1.3318, + "step": 4017 + }, + { + "epoch": 1.219608438306268, + "grad_norm": 0.5167221426963806, + "learning_rate": 7.971043839222436e-05, + "loss": 1.3981, + "step": 4018 + }, + { + "epoch": 1.2199119745029594, + "grad_norm": 0.5844317078590393, + "learning_rate": 7.970537612635415e-05, + "loss": 1.5698, + "step": 4019 + }, + { + "epoch": 1.2202155106996508, + "grad_norm": 0.47601571679115295, + "learning_rate": 7.970031386048396e-05, + "loss": 1.6672, + "step": 4020 + }, + { + "epoch": 1.2205190468963423, + "grad_norm": 0.5375113487243652, + "learning_rate": 7.969525159461376e-05, + "loss": 1.8034, + "step": 4021 + }, + { + "epoch": 1.220822583093034, + "grad_norm": 0.5123186111450195, + "learning_rate": 7.969018932874355e-05, + "loss": 1.9045, + "step": 4022 + }, + { + "epoch": 1.2211261192897254, + "grad_norm": 0.5284671187400818, + "learning_rate": 7.968512706287335e-05, + "loss": 1.5642, + "step": 4023 + }, + { + "epoch": 1.2214296554864168, + "grad_norm": 0.47770121693611145, + "learning_rate": 7.968006479700314e-05, + "loss": 1.4129, + "step": 4024 + }, + { + "epoch": 1.2217331916831082, + "grad_norm": 0.5404722094535828, + "learning_rate": 7.967500253113294e-05, + "loss": 1.8306, + "step": 4025 + }, + { + "epoch": 1.2220367278797997, + "grad_norm": 0.4636070430278778, + "learning_rate": 7.966994026526273e-05, + "loss": 1.9329, + "step": 4026 + }, + { + "epoch": 1.222340264076491, + "grad_norm": 0.4419996440410614, + "learning_rate": 7.966487799939252e-05, + "loss": 1.2901, + "step": 4027 + }, + { + "epoch": 1.2226438002731825, + "grad_norm": 0.9507476091384888, + "learning_rate": 7.965981573352232e-05, + "loss": 1.5751, + "step": 4028 + }, + { + "epoch": 1.222947336469874, + "grad_norm": 0.6683091521263123, + "learning_rate": 7.965475346765211e-05, + "loss": 1.9538, + "step": 4029 + }, + { + "epoch": 1.2232508726665654, + "grad_norm": 0.5125116109848022, + "learning_rate": 7.964969120178192e-05, + "loss": 1.7736, + "step": 4030 + }, + { + "epoch": 1.223554408863257, + "grad_norm": 0.5630882978439331, + "learning_rate": 7.964462893591172e-05, + "loss": 1.4199, + "step": 4031 + }, + { + "epoch": 1.2238579450599485, + "grad_norm": 0.5021055340766907, + "learning_rate": 7.963956667004151e-05, + "loss": 1.7814, + "step": 4032 + }, + { + "epoch": 1.22416148125664, + "grad_norm": 0.6553727984428406, + "learning_rate": 7.963450440417131e-05, + "loss": 1.8245, + "step": 4033 + }, + { + "epoch": 1.2244650174533314, + "grad_norm": 0.359862357378006, + "learning_rate": 7.96294421383011e-05, + "loss": 1.4599, + "step": 4034 + }, + { + "epoch": 1.2247685536500228, + "grad_norm": 0.541898787021637, + "learning_rate": 7.96243798724309e-05, + "loss": 1.7938, + "step": 4035 + }, + { + "epoch": 1.2250720898467142, + "grad_norm": 0.5998459458351135, + "learning_rate": 7.961931760656069e-05, + "loss": 2.0052, + "step": 4036 + }, + { + "epoch": 1.2253756260434057, + "grad_norm": 0.5199744701385498, + "learning_rate": 7.96142553406905e-05, + "loss": 1.6126, + "step": 4037 + }, + { + "epoch": 1.225679162240097, + "grad_norm": 0.6697866916656494, + "learning_rate": 7.96091930748203e-05, + "loss": 1.4301, + "step": 4038 + }, + { + "epoch": 1.2259826984367885, + "grad_norm": 0.5745710730552673, + "learning_rate": 7.960413080895009e-05, + "loss": 1.7098, + "step": 4039 + }, + { + "epoch": 1.22628623463348, + "grad_norm": 0.5066909193992615, + "learning_rate": 7.959906854307988e-05, + "loss": 2.1108, + "step": 4040 + }, + { + "epoch": 1.2265897708301714, + "grad_norm": 0.526587963104248, + "learning_rate": 7.959400627720969e-05, + "loss": 1.555, + "step": 4041 + }, + { + "epoch": 1.226893307026863, + "grad_norm": 0.5360167622566223, + "learning_rate": 7.958894401133949e-05, + "loss": 1.7745, + "step": 4042 + }, + { + "epoch": 1.2271968432235545, + "grad_norm": 0.5217559337615967, + "learning_rate": 7.958388174546928e-05, + "loss": 1.7375, + "step": 4043 + }, + { + "epoch": 1.227500379420246, + "grad_norm": 0.48961344361305237, + "learning_rate": 7.957881947959908e-05, + "loss": 1.7585, + "step": 4044 + }, + { + "epoch": 1.2278039156169374, + "grad_norm": 0.469046413898468, + "learning_rate": 7.957375721372887e-05, + "loss": 1.7162, + "step": 4045 + }, + { + "epoch": 1.2281074518136288, + "grad_norm": 0.5398122072219849, + "learning_rate": 7.956869494785867e-05, + "loss": 1.4512, + "step": 4046 + }, + { + "epoch": 1.2284109880103202, + "grad_norm": 0.615802526473999, + "learning_rate": 7.956363268198846e-05, + "loss": 1.9028, + "step": 4047 + }, + { + "epoch": 1.2287145242070117, + "grad_norm": 0.8024721741676331, + "learning_rate": 7.955857041611826e-05, + "loss": 1.6869, + "step": 4048 + }, + { + "epoch": 1.229018060403703, + "grad_norm": 0.4279814064502716, + "learning_rate": 7.955350815024805e-05, + "loss": 1.6522, + "step": 4049 + }, + { + "epoch": 1.2293215966003945, + "grad_norm": 0.5428003668785095, + "learning_rate": 7.954844588437786e-05, + "loss": 0.5135, + "step": 4050 + }, + { + "epoch": 1.229625132797086, + "grad_norm": 0.5030322670936584, + "learning_rate": 7.954338361850765e-05, + "loss": 1.6959, + "step": 4051 + }, + { + "epoch": 1.2299286689937774, + "grad_norm": 0.5142816305160522, + "learning_rate": 7.953832135263745e-05, + "loss": 0.9787, + "step": 4052 + }, + { + "epoch": 1.230232205190469, + "grad_norm": 0.476615309715271, + "learning_rate": 7.953325908676724e-05, + "loss": 1.5626, + "step": 4053 + }, + { + "epoch": 1.2305357413871605, + "grad_norm": 0.6307333111763, + "learning_rate": 7.952819682089704e-05, + "loss": 1.1121, + "step": 4054 + }, + { + "epoch": 1.230839277583852, + "grad_norm": 0.6847572922706604, + "learning_rate": 7.952313455502683e-05, + "loss": 1.2124, + "step": 4055 + }, + { + "epoch": 1.2311428137805434, + "grad_norm": 0.6222221851348877, + "learning_rate": 7.951807228915663e-05, + "loss": 1.7183, + "step": 4056 + }, + { + "epoch": 1.2314463499772348, + "grad_norm": 0.5753226280212402, + "learning_rate": 7.951301002328642e-05, + "loss": 1.1929, + "step": 4057 + }, + { + "epoch": 1.2317498861739262, + "grad_norm": 0.5416280031204224, + "learning_rate": 7.950794775741622e-05, + "loss": 1.7334, + "step": 4058 + }, + { + "epoch": 1.2320534223706177, + "grad_norm": 0.555026113986969, + "learning_rate": 7.950288549154603e-05, + "loss": 1.308, + "step": 4059 + }, + { + "epoch": 1.232356958567309, + "grad_norm": 0.5982434153556824, + "learning_rate": 7.949782322567582e-05, + "loss": 1.5865, + "step": 4060 + }, + { + "epoch": 1.2326604947640005, + "grad_norm": 0.5335420370101929, + "learning_rate": 7.949276095980562e-05, + "loss": 1.2918, + "step": 4061 + }, + { + "epoch": 1.2329640309606922, + "grad_norm": 0.4593915343284607, + "learning_rate": 7.948769869393541e-05, + "loss": 1.4323, + "step": 4062 + }, + { + "epoch": 1.2332675671573834, + "grad_norm": 0.4725258946418762, + "learning_rate": 7.94826364280652e-05, + "loss": 1.4033, + "step": 4063 + }, + { + "epoch": 1.233571103354075, + "grad_norm": 0.4811660945415497, + "learning_rate": 7.9477574162195e-05, + "loss": 1.9863, + "step": 4064 + }, + { + "epoch": 1.2338746395507665, + "grad_norm": 0.6393853425979614, + "learning_rate": 7.94725118963248e-05, + "loss": 1.7669, + "step": 4065 + }, + { + "epoch": 1.234178175747458, + "grad_norm": 0.5385059118270874, + "learning_rate": 7.946744963045459e-05, + "loss": 1.7176, + "step": 4066 + }, + { + "epoch": 1.2344817119441494, + "grad_norm": 0.5656344890594482, + "learning_rate": 7.946238736458438e-05, + "loss": 1.1107, + "step": 4067 + }, + { + "epoch": 1.2347852481408408, + "grad_norm": 0.5419704914093018, + "learning_rate": 7.945732509871418e-05, + "loss": 1.609, + "step": 4068 + }, + { + "epoch": 1.2350887843375322, + "grad_norm": 0.5997708439826965, + "learning_rate": 7.945226283284399e-05, + "loss": 1.4907, + "step": 4069 + }, + { + "epoch": 1.2353923205342237, + "grad_norm": 0.6861248016357422, + "learning_rate": 7.944720056697378e-05, + "loss": 1.2756, + "step": 4070 + }, + { + "epoch": 1.235695856730915, + "grad_norm": 0.6528524160385132, + "learning_rate": 7.944213830110358e-05, + "loss": 1.3925, + "step": 4071 + }, + { + "epoch": 1.2359993929276065, + "grad_norm": 0.5758741497993469, + "learning_rate": 7.943707603523337e-05, + "loss": 1.6849, + "step": 4072 + }, + { + "epoch": 1.2363029291242982, + "grad_norm": 0.4997986853122711, + "learning_rate": 7.943201376936317e-05, + "loss": 1.698, + "step": 4073 + }, + { + "epoch": 1.2366064653209896, + "grad_norm": 0.48319289088249207, + "learning_rate": 7.942695150349296e-05, + "loss": 1.7681, + "step": 4074 + }, + { + "epoch": 1.236910001517681, + "grad_norm": 0.6239420771598816, + "learning_rate": 7.942188923762276e-05, + "loss": 1.417, + "step": 4075 + }, + { + "epoch": 1.2372135377143725, + "grad_norm": 0.33963650465011597, + "learning_rate": 7.941682697175255e-05, + "loss": 1.2974, + "step": 4076 + }, + { + "epoch": 1.237517073911064, + "grad_norm": 0.9823358654975891, + "learning_rate": 7.941176470588235e-05, + "loss": 1.3348, + "step": 4077 + }, + { + "epoch": 1.2378206101077553, + "grad_norm": 0.5259244441986084, + "learning_rate": 7.940670244001215e-05, + "loss": 1.735, + "step": 4078 + }, + { + "epoch": 1.2381241463044468, + "grad_norm": 0.48676925897598267, + "learning_rate": 7.940164017414195e-05, + "loss": 1.8874, + "step": 4079 + }, + { + "epoch": 1.2384276825011382, + "grad_norm": 0.4340936243534088, + "learning_rate": 7.939657790827174e-05, + "loss": 1.8046, + "step": 4080 + }, + { + "epoch": 1.2387312186978297, + "grad_norm": 0.5821318030357361, + "learning_rate": 7.939151564240155e-05, + "loss": 1.5047, + "step": 4081 + }, + { + "epoch": 1.239034754894521, + "grad_norm": 0.5104279518127441, + "learning_rate": 7.938645337653135e-05, + "loss": 1.5912, + "step": 4082 + }, + { + "epoch": 1.2393382910912125, + "grad_norm": 0.511009693145752, + "learning_rate": 7.938139111066114e-05, + "loss": 1.7989, + "step": 4083 + }, + { + "epoch": 1.2396418272879042, + "grad_norm": 0.5314620137214661, + "learning_rate": 7.937632884479094e-05, + "loss": 1.7351, + "step": 4084 + }, + { + "epoch": 1.2399453634845956, + "grad_norm": 0.5641986131668091, + "learning_rate": 7.937126657892073e-05, + "loss": 1.2645, + "step": 4085 + }, + { + "epoch": 1.240248899681287, + "grad_norm": 0.4240927994251251, + "learning_rate": 7.936620431305053e-05, + "loss": 1.9579, + "step": 4086 + }, + { + "epoch": 1.2405524358779785, + "grad_norm": 0.5414063334465027, + "learning_rate": 7.936114204718032e-05, + "loss": 1.7812, + "step": 4087 + }, + { + "epoch": 1.24085597207467, + "grad_norm": 0.5069623589515686, + "learning_rate": 7.935607978131012e-05, + "loss": 1.3929, + "step": 4088 + }, + { + "epoch": 1.2411595082713613, + "grad_norm": 0.5139201879501343, + "learning_rate": 7.935101751543992e-05, + "loss": 1.757, + "step": 4089 + }, + { + "epoch": 1.2414630444680528, + "grad_norm": 0.6100741624832153, + "learning_rate": 7.934595524956972e-05, + "loss": 1.328, + "step": 4090 + }, + { + "epoch": 1.2417665806647442, + "grad_norm": 0.54306960105896, + "learning_rate": 7.934089298369951e-05, + "loss": 1.3427, + "step": 4091 + }, + { + "epoch": 1.2420701168614356, + "grad_norm": 0.6219137907028198, + "learning_rate": 7.933583071782931e-05, + "loss": 1.9352, + "step": 4092 + }, + { + "epoch": 1.2423736530581273, + "grad_norm": 0.5613607168197632, + "learning_rate": 7.93307684519591e-05, + "loss": 1.775, + "step": 4093 + }, + { + "epoch": 1.2426771892548185, + "grad_norm": 0.4179461896419525, + "learning_rate": 7.93257061860889e-05, + "loss": 1.8111, + "step": 4094 + }, + { + "epoch": 1.2429807254515102, + "grad_norm": 0.469662606716156, + "learning_rate": 7.932064392021869e-05, + "loss": 1.6524, + "step": 4095 + }, + { + "epoch": 1.2432842616482016, + "grad_norm": 0.5118122696876526, + "learning_rate": 7.931558165434849e-05, + "loss": 1.6265, + "step": 4096 + }, + { + "epoch": 1.243587797844893, + "grad_norm": 0.5353027582168579, + "learning_rate": 7.931051938847828e-05, + "loss": 1.7069, + "step": 4097 + }, + { + "epoch": 1.2438913340415845, + "grad_norm": 0.4309144914150238, + "learning_rate": 7.930545712260809e-05, + "loss": 1.9461, + "step": 4098 + }, + { + "epoch": 1.244194870238276, + "grad_norm": 0.5440289378166199, + "learning_rate": 7.930039485673789e-05, + "loss": 1.7768, + "step": 4099 + }, + { + "epoch": 1.2444984064349673, + "grad_norm": 0.8079060912132263, + "learning_rate": 7.929533259086768e-05, + "loss": 1.764, + "step": 4100 + }, + { + "epoch": 1.2448019426316588, + "grad_norm": 0.48742911219596863, + "learning_rate": 7.929027032499748e-05, + "loss": 1.7685, + "step": 4101 + }, + { + "epoch": 1.2451054788283502, + "grad_norm": 0.4479953944683075, + "learning_rate": 7.928520805912727e-05, + "loss": 1.3186, + "step": 4102 + }, + { + "epoch": 1.2454090150250416, + "grad_norm": 0.730973482131958, + "learning_rate": 7.928014579325706e-05, + "loss": 1.5752, + "step": 4103 + }, + { + "epoch": 1.2457125512217333, + "grad_norm": 0.5573110580444336, + "learning_rate": 7.927508352738686e-05, + "loss": 1.7444, + "step": 4104 + }, + { + "epoch": 1.2460160874184247, + "grad_norm": 0.5198193192481995, + "learning_rate": 7.927002126151665e-05, + "loss": 1.6983, + "step": 4105 + }, + { + "epoch": 1.2463196236151162, + "grad_norm": 0.7842777967453003, + "learning_rate": 7.926495899564645e-05, + "loss": 1.4516, + "step": 4106 + }, + { + "epoch": 1.2466231598118076, + "grad_norm": 0.7621930837631226, + "learning_rate": 7.925989672977624e-05, + "loss": 1.8484, + "step": 4107 + }, + { + "epoch": 1.246926696008499, + "grad_norm": 0.5683858394622803, + "learning_rate": 7.925483446390605e-05, + "loss": 1.881, + "step": 4108 + }, + { + "epoch": 1.2472302322051905, + "grad_norm": 0.5511360764503479, + "learning_rate": 7.924977219803585e-05, + "loss": 1.7022, + "step": 4109 + }, + { + "epoch": 1.247533768401882, + "grad_norm": 0.5645896196365356, + "learning_rate": 7.924470993216564e-05, + "loss": 1.8527, + "step": 4110 + }, + { + "epoch": 1.2478373045985733, + "grad_norm": 0.5634721517562866, + "learning_rate": 7.923964766629544e-05, + "loss": 1.7056, + "step": 4111 + }, + { + "epoch": 1.2481408407952648, + "grad_norm": 0.525572657585144, + "learning_rate": 7.923458540042523e-05, + "loss": 1.6768, + "step": 4112 + }, + { + "epoch": 1.2484443769919562, + "grad_norm": 0.49522659182548523, + "learning_rate": 7.922952313455503e-05, + "loss": 1.689, + "step": 4113 + }, + { + "epoch": 1.2487479131886476, + "grad_norm": 0.48527786135673523, + "learning_rate": 7.922446086868482e-05, + "loss": 1.4007, + "step": 4114 + }, + { + "epoch": 1.2490514493853393, + "grad_norm": 0.7304596900939941, + "learning_rate": 7.921939860281462e-05, + "loss": 1.4368, + "step": 4115 + }, + { + "epoch": 1.2493549855820307, + "grad_norm": 0.5543166995048523, + "learning_rate": 7.921433633694441e-05, + "loss": 1.5934, + "step": 4116 + }, + { + "epoch": 1.2496585217787222, + "grad_norm": 0.5324878692626953, + "learning_rate": 7.920927407107422e-05, + "loss": 1.6429, + "step": 4117 + }, + { + "epoch": 1.2499620579754136, + "grad_norm": 0.48488008975982666, + "learning_rate": 7.920421180520401e-05, + "loss": 1.281, + "step": 4118 + }, + { + "epoch": 1.250265594172105, + "grad_norm": 0.5011441111564636, + "learning_rate": 7.919914953933381e-05, + "loss": 1.4652, + "step": 4119 + }, + { + "epoch": 1.2505691303687965, + "grad_norm": 0.45932215452194214, + "learning_rate": 7.91940872734636e-05, + "loss": 1.8158, + "step": 4120 + }, + { + "epoch": 1.250872666565488, + "grad_norm": 0.5459942817687988, + "learning_rate": 7.91890250075934e-05, + "loss": 1.4303, + "step": 4121 + }, + { + "epoch": 1.2511762027621793, + "grad_norm": 0.5746182799339294, + "learning_rate": 7.918396274172319e-05, + "loss": 0.8716, + "step": 4122 + }, + { + "epoch": 1.2514797389588708, + "grad_norm": 0.5529314875602722, + "learning_rate": 7.917890047585299e-05, + "loss": 1.7586, + "step": 4123 + }, + { + "epoch": 1.2517832751555624, + "grad_norm": 0.5376235842704773, + "learning_rate": 7.917383820998278e-05, + "loss": 1.3224, + "step": 4124 + }, + { + "epoch": 1.2520868113522536, + "grad_norm": 0.5033860206604004, + "learning_rate": 7.916877594411258e-05, + "loss": 1.612, + "step": 4125 + }, + { + "epoch": 1.2523903475489453, + "grad_norm": 0.8470841646194458, + "learning_rate": 7.916371367824239e-05, + "loss": 0.9943, + "step": 4126 + }, + { + "epoch": 1.2526938837456367, + "grad_norm": 0.5008292198181152, + "learning_rate": 7.915865141237218e-05, + "loss": 1.6873, + "step": 4127 + }, + { + "epoch": 1.2529974199423282, + "grad_norm": 0.47267603874206543, + "learning_rate": 7.915358914650199e-05, + "loss": 1.7586, + "step": 4128 + }, + { + "epoch": 1.2533009561390196, + "grad_norm": 0.43899428844451904, + "learning_rate": 7.914852688063178e-05, + "loss": 1.1282, + "step": 4129 + }, + { + "epoch": 1.253604492335711, + "grad_norm": 0.4466108977794647, + "learning_rate": 7.914346461476158e-05, + "loss": 1.9044, + "step": 4130 + }, + { + "epoch": 1.2539080285324025, + "grad_norm": 0.4737585783004761, + "learning_rate": 7.913840234889137e-05, + "loss": 1.6987, + "step": 4131 + }, + { + "epoch": 1.254211564729094, + "grad_norm": 0.4812822937965393, + "learning_rate": 7.913334008302117e-05, + "loss": 0.8285, + "step": 4132 + }, + { + "epoch": 1.2545151009257853, + "grad_norm": 0.5936858654022217, + "learning_rate": 7.912827781715096e-05, + "loss": 1.5008, + "step": 4133 + }, + { + "epoch": 1.2548186371224768, + "grad_norm": 0.6072853207588196, + "learning_rate": 7.912321555128076e-05, + "loss": 1.2776, + "step": 4134 + }, + { + "epoch": 1.2551221733191684, + "grad_norm": 0.6064707636833191, + "learning_rate": 7.911815328541055e-05, + "loss": 2.0275, + "step": 4135 + }, + { + "epoch": 1.2554257095158596, + "grad_norm": 0.6108651757240295, + "learning_rate": 7.911309101954035e-05, + "loss": 1.5007, + "step": 4136 + }, + { + "epoch": 1.2557292457125513, + "grad_norm": 0.47926777601242065, + "learning_rate": 7.910802875367016e-05, + "loss": 1.5429, + "step": 4137 + }, + { + "epoch": 1.2560327819092427, + "grad_norm": 0.4983449876308441, + "learning_rate": 7.910296648779995e-05, + "loss": 1.7559, + "step": 4138 + }, + { + "epoch": 1.2563363181059342, + "grad_norm": 0.5694484710693359, + "learning_rate": 7.909790422192975e-05, + "loss": 1.9243, + "step": 4139 + }, + { + "epoch": 1.2566398543026256, + "grad_norm": 1.9593089818954468, + "learning_rate": 7.909284195605954e-05, + "loss": 1.1921, + "step": 4140 + }, + { + "epoch": 1.256943390499317, + "grad_norm": 0.4247440993785858, + "learning_rate": 7.908777969018933e-05, + "loss": 1.1906, + "step": 4141 + }, + { + "epoch": 1.2572469266960085, + "grad_norm": 0.4611380398273468, + "learning_rate": 7.908271742431913e-05, + "loss": 1.7775, + "step": 4142 + }, + { + "epoch": 1.2575504628927, + "grad_norm": 0.5350640416145325, + "learning_rate": 7.907765515844892e-05, + "loss": 1.656, + "step": 4143 + }, + { + "epoch": 1.2578539990893913, + "grad_norm": 0.5389347076416016, + "learning_rate": 7.907259289257872e-05, + "loss": 1.9316, + "step": 4144 + }, + { + "epoch": 1.2581575352860828, + "grad_norm": 0.7473251819610596, + "learning_rate": 7.906753062670851e-05, + "loss": 1.0278, + "step": 4145 + }, + { + "epoch": 1.2584610714827744, + "grad_norm": 0.6106333136558533, + "learning_rate": 7.906246836083831e-05, + "loss": 1.4104, + "step": 4146 + }, + { + "epoch": 1.2587646076794659, + "grad_norm": 0.5932298898696899, + "learning_rate": 7.905740609496812e-05, + "loss": 1.5366, + "step": 4147 + }, + { + "epoch": 1.2590681438761573, + "grad_norm": 0.6722679138183594, + "learning_rate": 7.905234382909791e-05, + "loss": 1.6094, + "step": 4148 + }, + { + "epoch": 1.2593716800728487, + "grad_norm": 0.5468285083770752, + "learning_rate": 7.90472815632277e-05, + "loss": 1.8556, + "step": 4149 + }, + { + "epoch": 1.2596752162695402, + "grad_norm": 0.6329994201660156, + "learning_rate": 7.90422192973575e-05, + "loss": 1.7896, + "step": 4150 + }, + { + "epoch": 1.2599787524662316, + "grad_norm": 0.4546709358692169, + "learning_rate": 7.90371570314873e-05, + "loss": 1.7365, + "step": 4151 + }, + { + "epoch": 1.260282288662923, + "grad_norm": 1.1800211668014526, + "learning_rate": 7.903209476561709e-05, + "loss": 1.6131, + "step": 4152 + }, + { + "epoch": 1.2605858248596145, + "grad_norm": 0.6181531548500061, + "learning_rate": 7.902703249974689e-05, + "loss": 1.9795, + "step": 4153 + }, + { + "epoch": 1.260889361056306, + "grad_norm": 0.4418911039829254, + "learning_rate": 7.902197023387668e-05, + "loss": 1.7098, + "step": 4154 + }, + { + "epoch": 1.2611928972529975, + "grad_norm": 0.49086809158325195, + "learning_rate": 7.901690796800648e-05, + "loss": 1.8146, + "step": 4155 + }, + { + "epoch": 1.2614964334496888, + "grad_norm": 0.5296843647956848, + "learning_rate": 7.901184570213628e-05, + "loss": 1.8665, + "step": 4156 + }, + { + "epoch": 1.2617999696463804, + "grad_norm": 0.58831787109375, + "learning_rate": 7.900678343626608e-05, + "loss": 1.17, + "step": 4157 + }, + { + "epoch": 1.2621035058430718, + "grad_norm": 0.6061310768127441, + "learning_rate": 7.900172117039587e-05, + "loss": 1.2425, + "step": 4158 + }, + { + "epoch": 1.2624070420397633, + "grad_norm": 0.5422036647796631, + "learning_rate": 7.899665890452567e-05, + "loss": 1.8006, + "step": 4159 + }, + { + "epoch": 1.2627105782364547, + "grad_norm": 0.5529537796974182, + "learning_rate": 7.899159663865546e-05, + "loss": 1.4331, + "step": 4160 + }, + { + "epoch": 1.2630141144331462, + "grad_norm": 0.5384377241134644, + "learning_rate": 7.898653437278526e-05, + "loss": 1.7835, + "step": 4161 + }, + { + "epoch": 1.2633176506298376, + "grad_norm": 0.47016119956970215, + "learning_rate": 7.898147210691505e-05, + "loss": 1.2891, + "step": 4162 + }, + { + "epoch": 1.263621186826529, + "grad_norm": 0.5672861337661743, + "learning_rate": 7.897640984104485e-05, + "loss": 1.8462, + "step": 4163 + }, + { + "epoch": 1.2639247230232205, + "grad_norm": 0.5559744834899902, + "learning_rate": 7.897134757517464e-05, + "loss": 1.6961, + "step": 4164 + }, + { + "epoch": 1.2642282592199119, + "grad_norm": 0.5468736886978149, + "learning_rate": 7.896628530930445e-05, + "loss": 1.9711, + "step": 4165 + }, + { + "epoch": 1.2645317954166035, + "grad_norm": 0.5326992869377136, + "learning_rate": 7.896122304343425e-05, + "loss": 1.8224, + "step": 4166 + }, + { + "epoch": 1.2648353316132948, + "grad_norm": 0.5643085837364197, + "learning_rate": 7.895616077756404e-05, + "loss": 1.4387, + "step": 4167 + }, + { + "epoch": 1.2651388678099864, + "grad_norm": 0.5054870247840881, + "learning_rate": 7.895109851169383e-05, + "loss": 1.615, + "step": 4168 + }, + { + "epoch": 1.2654424040066778, + "grad_norm": 0.47247734665870667, + "learning_rate": 7.894603624582363e-05, + "loss": 1.3798, + "step": 4169 + }, + { + "epoch": 1.2657459402033693, + "grad_norm": 1.0349562168121338, + "learning_rate": 7.894097397995344e-05, + "loss": 1.2852, + "step": 4170 + }, + { + "epoch": 1.2660494764000607, + "grad_norm": 0.4410361647605896, + "learning_rate": 7.893591171408323e-05, + "loss": 1.1724, + "step": 4171 + }, + { + "epoch": 1.2663530125967521, + "grad_norm": 0.4972288906574249, + "learning_rate": 7.893084944821303e-05, + "loss": 1.8946, + "step": 4172 + }, + { + "epoch": 1.2666565487934436, + "grad_norm": 0.5441880822181702, + "learning_rate": 7.892578718234282e-05, + "loss": 1.5582, + "step": 4173 + }, + { + "epoch": 1.266960084990135, + "grad_norm": 0.6143850088119507, + "learning_rate": 7.892072491647262e-05, + "loss": 1.5693, + "step": 4174 + }, + { + "epoch": 1.2672636211868264, + "grad_norm": 0.49787527322769165, + "learning_rate": 7.891566265060241e-05, + "loss": 1.7123, + "step": 4175 + }, + { + "epoch": 1.2675671573835179, + "grad_norm": 0.5437433123588562, + "learning_rate": 7.891060038473222e-05, + "loss": 1.4882, + "step": 4176 + }, + { + "epoch": 1.2678706935802095, + "grad_norm": 0.4889003038406372, + "learning_rate": 7.890553811886202e-05, + "loss": 1.6611, + "step": 4177 + }, + { + "epoch": 1.268174229776901, + "grad_norm": 0.5215193629264832, + "learning_rate": 7.890047585299181e-05, + "loss": 1.7556, + "step": 4178 + }, + { + "epoch": 1.2684777659735924, + "grad_norm": 0.5660579204559326, + "learning_rate": 7.88954135871216e-05, + "loss": 1.6121, + "step": 4179 + }, + { + "epoch": 1.2687813021702838, + "grad_norm": 0.4969503879547119, + "learning_rate": 7.88903513212514e-05, + "loss": 1.6405, + "step": 4180 + }, + { + "epoch": 1.2690848383669753, + "grad_norm": 0.528002142906189, + "learning_rate": 7.88852890553812e-05, + "loss": 1.4562, + "step": 4181 + }, + { + "epoch": 1.2693883745636667, + "grad_norm": 0.5257166028022766, + "learning_rate": 7.888022678951099e-05, + "loss": 0.9778, + "step": 4182 + }, + { + "epoch": 1.2696919107603581, + "grad_norm": 0.6283448934555054, + "learning_rate": 7.887516452364078e-05, + "loss": 1.6004, + "step": 4183 + }, + { + "epoch": 1.2699954469570496, + "grad_norm": 0.5746490359306335, + "learning_rate": 7.887010225777058e-05, + "loss": 2.0969, + "step": 4184 + }, + { + "epoch": 1.270298983153741, + "grad_norm": 0.5110850930213928, + "learning_rate": 7.886503999190037e-05, + "loss": 1.608, + "step": 4185 + }, + { + "epoch": 1.2706025193504327, + "grad_norm": 0.48699095845222473, + "learning_rate": 7.885997772603018e-05, + "loss": 1.8913, + "step": 4186 + }, + { + "epoch": 1.2709060555471239, + "grad_norm": 0.6163387894630432, + "learning_rate": 7.885491546015998e-05, + "loss": 1.9176, + "step": 4187 + }, + { + "epoch": 1.2712095917438155, + "grad_norm": 0.5177997350692749, + "learning_rate": 7.884985319428977e-05, + "loss": 1.343, + "step": 4188 + }, + { + "epoch": 1.271513127940507, + "grad_norm": 0.5938069820404053, + "learning_rate": 7.884479092841957e-05, + "loss": 1.4215, + "step": 4189 + }, + { + "epoch": 1.2718166641371984, + "grad_norm": 0.5924432277679443, + "learning_rate": 7.883972866254936e-05, + "loss": 1.6975, + "step": 4190 + }, + { + "epoch": 1.2721202003338898, + "grad_norm": 0.4476606845855713, + "learning_rate": 7.883466639667916e-05, + "loss": 1.5248, + "step": 4191 + }, + { + "epoch": 1.2724237365305813, + "grad_norm": 0.5014340281486511, + "learning_rate": 7.882960413080895e-05, + "loss": 1.8972, + "step": 4192 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 0.5956346988677979, + "learning_rate": 7.882454186493875e-05, + "loss": 1.634, + "step": 4193 + }, + { + "epoch": 1.2730308089239641, + "grad_norm": 0.4889664947986603, + "learning_rate": 7.881947959906854e-05, + "loss": 1.6317, + "step": 4194 + }, + { + "epoch": 1.2733343451206556, + "grad_norm": 0.6264504194259644, + "learning_rate": 7.881441733319835e-05, + "loss": 1.1283, + "step": 4195 + }, + { + "epoch": 1.273637881317347, + "grad_norm": 0.5363176465034485, + "learning_rate": 7.880935506732814e-05, + "loss": 1.8875, + "step": 4196 + }, + { + "epoch": 1.2739414175140387, + "grad_norm": 0.6158793568611145, + "learning_rate": 7.880429280145794e-05, + "loss": 1.6791, + "step": 4197 + }, + { + "epoch": 1.2742449537107299, + "grad_norm": 0.500588059425354, + "learning_rate": 7.879923053558773e-05, + "loss": 1.8019, + "step": 4198 + }, + { + "epoch": 1.2745484899074215, + "grad_norm": 0.40965986251831055, + "learning_rate": 7.879416826971753e-05, + "loss": 1.1229, + "step": 4199 + }, + { + "epoch": 1.274852026104113, + "grad_norm": 0.5436606407165527, + "learning_rate": 7.878910600384732e-05, + "loss": 1.3439, + "step": 4200 + }, + { + "epoch": 1.2751555623008044, + "grad_norm": 0.4682038724422455, + "learning_rate": 7.878404373797712e-05, + "loss": 1.3136, + "step": 4201 + }, + { + "epoch": 1.2754590984974958, + "grad_norm": 0.5896442532539368, + "learning_rate": 7.877898147210691e-05, + "loss": 1.8639, + "step": 4202 + }, + { + "epoch": 1.2757626346941873, + "grad_norm": 0.47574663162231445, + "learning_rate": 7.877391920623671e-05, + "loss": 1.8716, + "step": 4203 + }, + { + "epoch": 1.2760661708908787, + "grad_norm": 0.48415321111679077, + "learning_rate": 7.876885694036652e-05, + "loss": 1.8162, + "step": 4204 + }, + { + "epoch": 1.2763697070875701, + "grad_norm": 0.45066776871681213, + "learning_rate": 7.876379467449631e-05, + "loss": 1.8577, + "step": 4205 + }, + { + "epoch": 1.2766732432842616, + "grad_norm": 0.719373345375061, + "learning_rate": 7.87587324086261e-05, + "loss": 1.3966, + "step": 4206 + }, + { + "epoch": 1.276976779480953, + "grad_norm": 0.5312978029251099, + "learning_rate": 7.87536701427559e-05, + "loss": 1.7253, + "step": 4207 + }, + { + "epoch": 1.2772803156776447, + "grad_norm": 0.46591347455978394, + "learning_rate": 7.87486078768857e-05, + "loss": 1.5623, + "step": 4208 + }, + { + "epoch": 1.277583851874336, + "grad_norm": 0.524597704410553, + "learning_rate": 7.874354561101549e-05, + "loss": 1.5096, + "step": 4209 + }, + { + "epoch": 1.2778873880710275, + "grad_norm": 0.5499995350837708, + "learning_rate": 7.873848334514528e-05, + "loss": 1.077, + "step": 4210 + }, + { + "epoch": 1.278190924267719, + "grad_norm": 0.5146958231925964, + "learning_rate": 7.873342107927508e-05, + "loss": 1.3829, + "step": 4211 + }, + { + "epoch": 1.2784944604644104, + "grad_norm": 0.4905032217502594, + "learning_rate": 7.872835881340487e-05, + "loss": 2.0092, + "step": 4212 + }, + { + "epoch": 1.2787979966611018, + "grad_norm": 0.5487980246543884, + "learning_rate": 7.872329654753467e-05, + "loss": 1.7831, + "step": 4213 + }, + { + "epoch": 1.2791015328577933, + "grad_norm": 0.6084244251251221, + "learning_rate": 7.871823428166448e-05, + "loss": 1.6648, + "step": 4214 + }, + { + "epoch": 1.2794050690544847, + "grad_norm": 0.4461166560649872, + "learning_rate": 7.871317201579429e-05, + "loss": 1.8962, + "step": 4215 + }, + { + "epoch": 1.2797086052511761, + "grad_norm": 0.5374088287353516, + "learning_rate": 7.870810974992408e-05, + "loss": 2.0232, + "step": 4216 + }, + { + "epoch": 1.2800121414478678, + "grad_norm": 0.75820392370224, + "learning_rate": 7.870304748405387e-05, + "loss": 1.816, + "step": 4217 + }, + { + "epoch": 1.280315677644559, + "grad_norm": 0.5131720900535583, + "learning_rate": 7.869798521818367e-05, + "loss": 1.6794, + "step": 4218 + }, + { + "epoch": 1.2806192138412507, + "grad_norm": 0.5353872179985046, + "learning_rate": 7.869292295231346e-05, + "loss": 1.6256, + "step": 4219 + }, + { + "epoch": 1.280922750037942, + "grad_norm": 0.5746525526046753, + "learning_rate": 7.868786068644326e-05, + "loss": 1.4688, + "step": 4220 + }, + { + "epoch": 1.2812262862346335, + "grad_norm": 0.4623584747314453, + "learning_rate": 7.868279842057305e-05, + "loss": 1.8531, + "step": 4221 + }, + { + "epoch": 1.281529822431325, + "grad_norm": 0.45438817143440247, + "learning_rate": 7.867773615470285e-05, + "loss": 2.0002, + "step": 4222 + }, + { + "epoch": 1.2818333586280164, + "grad_norm": 0.5203740000724792, + "learning_rate": 7.867267388883264e-05, + "loss": 1.5445, + "step": 4223 + }, + { + "epoch": 1.2821368948247078, + "grad_norm": 0.5143687129020691, + "learning_rate": 7.866761162296244e-05, + "loss": 1.6431, + "step": 4224 + }, + { + "epoch": 1.2824404310213993, + "grad_norm": 0.42706185579299927, + "learning_rate": 7.866254935709225e-05, + "loss": 1.6228, + "step": 4225 + }, + { + "epoch": 1.2827439672180907, + "grad_norm": 0.49767574667930603, + "learning_rate": 7.865748709122204e-05, + "loss": 1.8411, + "step": 4226 + }, + { + "epoch": 1.2830475034147821, + "grad_norm": 1.3535252809524536, + "learning_rate": 7.865242482535184e-05, + "loss": 1.4473, + "step": 4227 + }, + { + "epoch": 1.2833510396114738, + "grad_norm": 0.8082306981086731, + "learning_rate": 7.864736255948163e-05, + "loss": 1.8145, + "step": 4228 + }, + { + "epoch": 1.283654575808165, + "grad_norm": 0.5417949557304382, + "learning_rate": 7.864230029361143e-05, + "loss": 1.8166, + "step": 4229 + }, + { + "epoch": 1.2839581120048567, + "grad_norm": 0.45955365896224976, + "learning_rate": 7.863723802774122e-05, + "loss": 1.8228, + "step": 4230 + }, + { + "epoch": 1.284261648201548, + "grad_norm": 0.512393593788147, + "learning_rate": 7.863217576187102e-05, + "loss": 1.7234, + "step": 4231 + }, + { + "epoch": 1.2845651843982395, + "grad_norm": 0.6876609325408936, + "learning_rate": 7.862711349600081e-05, + "loss": 1.6925, + "step": 4232 + }, + { + "epoch": 1.284868720594931, + "grad_norm": 0.4111330509185791, + "learning_rate": 7.86220512301306e-05, + "loss": 1.5009, + "step": 4233 + }, + { + "epoch": 1.2851722567916224, + "grad_norm": 1.1861661672592163, + "learning_rate": 7.861698896426041e-05, + "loss": 1.8235, + "step": 4234 + }, + { + "epoch": 1.2854757929883138, + "grad_norm": 0.47272002696990967, + "learning_rate": 7.861192669839021e-05, + "loss": 1.8286, + "step": 4235 + }, + { + "epoch": 1.2857793291850053, + "grad_norm": 0.5344218015670776, + "learning_rate": 7.860686443252e-05, + "loss": 1.72, + "step": 4236 + }, + { + "epoch": 1.2860828653816967, + "grad_norm": 0.6022644639015198, + "learning_rate": 7.86018021666498e-05, + "loss": 1.6321, + "step": 4237 + }, + { + "epoch": 1.2863864015783881, + "grad_norm": 0.47625505924224854, + "learning_rate": 7.859673990077959e-05, + "loss": 1.5977, + "step": 4238 + }, + { + "epoch": 1.2866899377750798, + "grad_norm": 0.5062893033027649, + "learning_rate": 7.859167763490939e-05, + "loss": 1.7365, + "step": 4239 + }, + { + "epoch": 1.286993473971771, + "grad_norm": 0.6051474213600159, + "learning_rate": 7.858661536903918e-05, + "loss": 1.141, + "step": 4240 + }, + { + "epoch": 1.2872970101684627, + "grad_norm": 0.47197285294532776, + "learning_rate": 7.858155310316898e-05, + "loss": 1.4489, + "step": 4241 + }, + { + "epoch": 1.287600546365154, + "grad_norm": 0.942813515663147, + "learning_rate": 7.857649083729877e-05, + "loss": 1.5708, + "step": 4242 + }, + { + "epoch": 1.2879040825618455, + "grad_norm": 0.574316143989563, + "learning_rate": 7.857142857142858e-05, + "loss": 1.5814, + "step": 4243 + }, + { + "epoch": 1.288207618758537, + "grad_norm": 0.491734117269516, + "learning_rate": 7.856636630555837e-05, + "loss": 2.1318, + "step": 4244 + }, + { + "epoch": 1.2885111549552284, + "grad_norm": 0.5177463889122009, + "learning_rate": 7.856130403968817e-05, + "loss": 1.1797, + "step": 4245 + }, + { + "epoch": 1.2888146911519198, + "grad_norm": 0.5329721570014954, + "learning_rate": 7.855624177381796e-05, + "loss": 1.8383, + "step": 4246 + }, + { + "epoch": 1.2891182273486113, + "grad_norm": 0.5926728248596191, + "learning_rate": 7.855117950794776e-05, + "loss": 1.5035, + "step": 4247 + }, + { + "epoch": 1.289421763545303, + "grad_norm": 0.8679110407829285, + "learning_rate": 7.854611724207755e-05, + "loss": 1.39, + "step": 4248 + }, + { + "epoch": 1.2897252997419941, + "grad_norm": 0.5424472689628601, + "learning_rate": 7.854105497620735e-05, + "loss": 1.4622, + "step": 4249 + }, + { + "epoch": 1.2900288359386858, + "grad_norm": 0.556882917881012, + "learning_rate": 7.853599271033714e-05, + "loss": 2.0715, + "step": 4250 + }, + { + "epoch": 1.2903323721353772, + "grad_norm": 0.4874313175678253, + "learning_rate": 7.853093044446694e-05, + "loss": 1.3539, + "step": 4251 + }, + { + "epoch": 1.2906359083320686, + "grad_norm": 0.5125330090522766, + "learning_rate": 7.852586817859673e-05, + "loss": 1.2546, + "step": 4252 + }, + { + "epoch": 1.29093944452876, + "grad_norm": 0.5772746205329895, + "learning_rate": 7.852080591272654e-05, + "loss": 1.8095, + "step": 4253 + }, + { + "epoch": 1.2912429807254515, + "grad_norm": 0.5157619118690491, + "learning_rate": 7.851574364685634e-05, + "loss": 1.4485, + "step": 4254 + }, + { + "epoch": 1.291546516922143, + "grad_norm": 0.5483890771865845, + "learning_rate": 7.851068138098613e-05, + "loss": 1.4703, + "step": 4255 + }, + { + "epoch": 1.2918500531188344, + "grad_norm": 0.5271161794662476, + "learning_rate": 7.850561911511593e-05, + "loss": 1.6138, + "step": 4256 + }, + { + "epoch": 1.2921535893155258, + "grad_norm": 0.5618970990180969, + "learning_rate": 7.850055684924572e-05, + "loss": 1.5364, + "step": 4257 + }, + { + "epoch": 1.2924571255122173, + "grad_norm": 0.5047764182090759, + "learning_rate": 7.849549458337552e-05, + "loss": 1.7535, + "step": 4258 + }, + { + "epoch": 1.292760661708909, + "grad_norm": 0.5401104688644409, + "learning_rate": 7.849043231750532e-05, + "loss": 1.6222, + "step": 4259 + }, + { + "epoch": 1.2930641979056001, + "grad_norm": 0.5193315148353577, + "learning_rate": 7.848537005163512e-05, + "loss": 1.5011, + "step": 4260 + }, + { + "epoch": 1.2933677341022918, + "grad_norm": 0.5561128258705139, + "learning_rate": 7.848030778576491e-05, + "loss": 1.474, + "step": 4261 + }, + { + "epoch": 1.2936712702989832, + "grad_norm": 0.4799818992614746, + "learning_rate": 7.847524551989471e-05, + "loss": 1.1537, + "step": 4262 + }, + { + "epoch": 1.2939748064956746, + "grad_norm": 0.47398802638053894, + "learning_rate": 7.84701832540245e-05, + "loss": 1.3131, + "step": 4263 + }, + { + "epoch": 1.294278342692366, + "grad_norm": 0.5396584272384644, + "learning_rate": 7.846512098815431e-05, + "loss": 1.8291, + "step": 4264 + }, + { + "epoch": 1.2945818788890575, + "grad_norm": 0.5745276808738708, + "learning_rate": 7.84600587222841e-05, + "loss": 1.4679, + "step": 4265 + }, + { + "epoch": 1.294885415085749, + "grad_norm": 0.4571831226348877, + "learning_rate": 7.84549964564139e-05, + "loss": 1.2732, + "step": 4266 + }, + { + "epoch": 1.2951889512824404, + "grad_norm": 0.5755758285522461, + "learning_rate": 7.84499341905437e-05, + "loss": 1.5643, + "step": 4267 + }, + { + "epoch": 1.2954924874791318, + "grad_norm": 0.4864906966686249, + "learning_rate": 7.844487192467349e-05, + "loss": 1.7432, + "step": 4268 + }, + { + "epoch": 1.2957960236758232, + "grad_norm": 0.5033892393112183, + "learning_rate": 7.843980965880329e-05, + "loss": 1.8336, + "step": 4269 + }, + { + "epoch": 1.296099559872515, + "grad_norm": 0.5319492220878601, + "learning_rate": 7.843474739293308e-05, + "loss": 1.0529, + "step": 4270 + }, + { + "epoch": 1.2964030960692061, + "grad_norm": 0.5821359157562256, + "learning_rate": 7.842968512706287e-05, + "loss": 1.513, + "step": 4271 + }, + { + "epoch": 1.2967066322658978, + "grad_norm": 0.5181142687797546, + "learning_rate": 7.842462286119267e-05, + "loss": 1.759, + "step": 4272 + }, + { + "epoch": 1.2970101684625892, + "grad_norm": 0.614640474319458, + "learning_rate": 7.841956059532248e-05, + "loss": 2.1172, + "step": 4273 + }, + { + "epoch": 1.2973137046592806, + "grad_norm": 0.5049278736114502, + "learning_rate": 7.841449832945227e-05, + "loss": 1.7877, + "step": 4274 + }, + { + "epoch": 1.297617240855972, + "grad_norm": 0.9571356177330017, + "learning_rate": 7.840943606358207e-05, + "loss": 1.4665, + "step": 4275 + }, + { + "epoch": 1.2979207770526635, + "grad_norm": 0.9249529242515564, + "learning_rate": 7.840437379771186e-05, + "loss": 1.066, + "step": 4276 + }, + { + "epoch": 1.298224313249355, + "grad_norm": 0.4696667492389679, + "learning_rate": 7.839931153184166e-05, + "loss": 1.5052, + "step": 4277 + }, + { + "epoch": 1.2985278494460464, + "grad_norm": 0.44224581122398376, + "learning_rate": 7.839424926597145e-05, + "loss": 1.3647, + "step": 4278 + }, + { + "epoch": 1.298831385642738, + "grad_norm": 0.4960525631904602, + "learning_rate": 7.838918700010125e-05, + "loss": 1.6771, + "step": 4279 + }, + { + "epoch": 1.2991349218394292, + "grad_norm": 0.5027126669883728, + "learning_rate": 7.838412473423104e-05, + "loss": 1.4619, + "step": 4280 + }, + { + "epoch": 1.299438458036121, + "grad_norm": 0.547900378704071, + "learning_rate": 7.837906246836084e-05, + "loss": 1.768, + "step": 4281 + }, + { + "epoch": 1.2997419942328123, + "grad_norm": 0.6043573021888733, + "learning_rate": 7.837400020249064e-05, + "loss": 1.4069, + "step": 4282 + }, + { + "epoch": 1.3000455304295038, + "grad_norm": 0.6143855452537537, + "learning_rate": 7.836893793662044e-05, + "loss": 1.7363, + "step": 4283 + }, + { + "epoch": 1.3003490666261952, + "grad_norm": 0.37396860122680664, + "learning_rate": 7.836387567075023e-05, + "loss": 1.2622, + "step": 4284 + }, + { + "epoch": 1.3006526028228866, + "grad_norm": 0.5041464567184448, + "learning_rate": 7.835881340488003e-05, + "loss": 1.8698, + "step": 4285 + }, + { + "epoch": 1.300956139019578, + "grad_norm": 0.5512543320655823, + "learning_rate": 7.835375113900982e-05, + "loss": 1.7105, + "step": 4286 + }, + { + "epoch": 1.3012596752162695, + "grad_norm": 0.5649422407150269, + "learning_rate": 7.834868887313962e-05, + "loss": 1.859, + "step": 4287 + }, + { + "epoch": 1.301563211412961, + "grad_norm": 0.5447474718093872, + "learning_rate": 7.834362660726941e-05, + "loss": 1.5593, + "step": 4288 + }, + { + "epoch": 1.3018667476096524, + "grad_norm": 0.601941704750061, + "learning_rate": 7.833856434139921e-05, + "loss": 1.0568, + "step": 4289 + }, + { + "epoch": 1.302170283806344, + "grad_norm": 0.527382493019104, + "learning_rate": 7.8333502075529e-05, + "loss": 1.5618, + "step": 4290 + }, + { + "epoch": 1.3024738200030352, + "grad_norm": 0.6012628078460693, + "learning_rate": 7.83284398096588e-05, + "loss": 1.6444, + "step": 4291 + }, + { + "epoch": 1.302777356199727, + "grad_norm": 0.552227795124054, + "learning_rate": 7.83233775437886e-05, + "loss": 1.5268, + "step": 4292 + }, + { + "epoch": 1.3030808923964183, + "grad_norm": 0.6038103699684143, + "learning_rate": 7.83183152779184e-05, + "loss": 1.6447, + "step": 4293 + }, + { + "epoch": 1.3033844285931098, + "grad_norm": 0.6079698801040649, + "learning_rate": 7.83132530120482e-05, + "loss": 1.6474, + "step": 4294 + }, + { + "epoch": 1.3036879647898012, + "grad_norm": 0.5434030890464783, + "learning_rate": 7.830819074617799e-05, + "loss": 1.7963, + "step": 4295 + }, + { + "epoch": 1.3039915009864926, + "grad_norm": 0.4274088144302368, + "learning_rate": 7.830312848030779e-05, + "loss": 1.5818, + "step": 4296 + }, + { + "epoch": 1.304295037183184, + "grad_norm": 0.5364022254943848, + "learning_rate": 7.829806621443758e-05, + "loss": 1.6226, + "step": 4297 + }, + { + "epoch": 1.3045985733798755, + "grad_norm": 0.47048020362854004, + "learning_rate": 7.829300394856738e-05, + "loss": 1.7852, + "step": 4298 + }, + { + "epoch": 1.304902109576567, + "grad_norm": 0.9063312411308289, + "learning_rate": 7.828794168269717e-05, + "loss": 1.5717, + "step": 4299 + }, + { + "epoch": 1.3052056457732584, + "grad_norm": 0.5323641300201416, + "learning_rate": 7.828287941682696e-05, + "loss": 1.6635, + "step": 4300 + }, + { + "epoch": 1.30550918196995, + "grad_norm": 0.4425245523452759, + "learning_rate": 7.827781715095677e-05, + "loss": 1.9034, + "step": 4301 + }, + { + "epoch": 1.3058127181666412, + "grad_norm": 0.553707480430603, + "learning_rate": 7.827275488508657e-05, + "loss": 1.7733, + "step": 4302 + }, + { + "epoch": 1.306116254363333, + "grad_norm": 0.5723200440406799, + "learning_rate": 7.826769261921636e-05, + "loss": 1.6769, + "step": 4303 + }, + { + "epoch": 1.3064197905600243, + "grad_norm": 0.9885048866271973, + "learning_rate": 7.826263035334617e-05, + "loss": 0.931, + "step": 4304 + }, + { + "epoch": 1.3067233267567158, + "grad_norm": 0.506885826587677, + "learning_rate": 7.825756808747597e-05, + "loss": 1.795, + "step": 4305 + }, + { + "epoch": 1.3070268629534072, + "grad_norm": 0.5606210231781006, + "learning_rate": 7.825250582160576e-05, + "loss": 1.2253, + "step": 4306 + }, + { + "epoch": 1.3073303991500986, + "grad_norm": 0.7443845868110657, + "learning_rate": 7.824744355573556e-05, + "loss": 1.9322, + "step": 4307 + }, + { + "epoch": 1.30763393534679, + "grad_norm": 0.5564523935317993, + "learning_rate": 7.824238128986535e-05, + "loss": 1.6893, + "step": 4308 + }, + { + "epoch": 1.3079374715434815, + "grad_norm": 0.5279399752616882, + "learning_rate": 7.823731902399514e-05, + "loss": 1.8677, + "step": 4309 + }, + { + "epoch": 1.308241007740173, + "grad_norm": 0.43700218200683594, + "learning_rate": 7.823225675812494e-05, + "loss": 1.9146, + "step": 4310 + }, + { + "epoch": 1.3085445439368644, + "grad_norm": 0.49414920806884766, + "learning_rate": 7.822719449225473e-05, + "loss": 1.5188, + "step": 4311 + }, + { + "epoch": 1.308848080133556, + "grad_norm": 0.5282542109489441, + "learning_rate": 7.822213222638454e-05, + "loss": 1.869, + "step": 4312 + }, + { + "epoch": 1.3091516163302475, + "grad_norm": 0.5799091458320618, + "learning_rate": 7.821706996051434e-05, + "loss": 1.4757, + "step": 4313 + }, + { + "epoch": 1.309455152526939, + "grad_norm": 0.5351842045783997, + "learning_rate": 7.821200769464413e-05, + "loss": 1.9567, + "step": 4314 + }, + { + "epoch": 1.3097586887236303, + "grad_norm": 0.7459053993225098, + "learning_rate": 7.820694542877393e-05, + "loss": 1.3684, + "step": 4315 + }, + { + "epoch": 1.3100622249203218, + "grad_norm": 0.5702859163284302, + "learning_rate": 7.820188316290372e-05, + "loss": 1.9038, + "step": 4316 + }, + { + "epoch": 1.3103657611170132, + "grad_norm": 0.5957212448120117, + "learning_rate": 7.819682089703352e-05, + "loss": 1.6239, + "step": 4317 + }, + { + "epoch": 1.3106692973137046, + "grad_norm": 0.5540712475776672, + "learning_rate": 7.819175863116331e-05, + "loss": 1.7692, + "step": 4318 + }, + { + "epoch": 1.310972833510396, + "grad_norm": 0.5562117099761963, + "learning_rate": 7.81866963652931e-05, + "loss": 1.5726, + "step": 4319 + }, + { + "epoch": 1.3112763697070875, + "grad_norm": 0.597484290599823, + "learning_rate": 7.81816340994229e-05, + "loss": 1.2881, + "step": 4320 + }, + { + "epoch": 1.3115799059037792, + "grad_norm": 0.4560807943344116, + "learning_rate": 7.817657183355271e-05, + "loss": 1.6298, + "step": 4321 + }, + { + "epoch": 1.3118834421004704, + "grad_norm": 0.5292729139328003, + "learning_rate": 7.81715095676825e-05, + "loss": 1.427, + "step": 4322 + }, + { + "epoch": 1.312186978297162, + "grad_norm": 0.5305735468864441, + "learning_rate": 7.81664473018123e-05, + "loss": 1.457, + "step": 4323 + }, + { + "epoch": 1.3124905144938535, + "grad_norm": 0.4915447533130646, + "learning_rate": 7.81613850359421e-05, + "loss": 1.523, + "step": 4324 + }, + { + "epoch": 1.312794050690545, + "grad_norm": 0.5826139450073242, + "learning_rate": 7.815632277007189e-05, + "loss": 1.562, + "step": 4325 + }, + { + "epoch": 1.3130975868872363, + "grad_norm": 0.39976975321769714, + "learning_rate": 7.815126050420168e-05, + "loss": 1.4048, + "step": 4326 + }, + { + "epoch": 1.3134011230839278, + "grad_norm": 0.4750303030014038, + "learning_rate": 7.814619823833148e-05, + "loss": 1.9087, + "step": 4327 + }, + { + "epoch": 1.3137046592806192, + "grad_norm": 0.6270216703414917, + "learning_rate": 7.814113597246127e-05, + "loss": 1.6378, + "step": 4328 + }, + { + "epoch": 1.3140081954773106, + "grad_norm": 0.553433895111084, + "learning_rate": 7.813607370659107e-05, + "loss": 1.6217, + "step": 4329 + }, + { + "epoch": 1.314311731674002, + "grad_norm": 0.568313717842102, + "learning_rate": 7.813101144072086e-05, + "loss": 1.875, + "step": 4330 + }, + { + "epoch": 1.3146152678706935, + "grad_norm": 0.47426313161849976, + "learning_rate": 7.812594917485067e-05, + "loss": 1.8395, + "step": 4331 + }, + { + "epoch": 1.3149188040673851, + "grad_norm": 0.5144320726394653, + "learning_rate": 7.812088690898047e-05, + "loss": 1.6856, + "step": 4332 + }, + { + "epoch": 1.3152223402640764, + "grad_norm": 0.4862743616104126, + "learning_rate": 7.811582464311026e-05, + "loss": 1.6403, + "step": 4333 + }, + { + "epoch": 1.315525876460768, + "grad_norm": 0.5968044996261597, + "learning_rate": 7.811076237724006e-05, + "loss": 1.5937, + "step": 4334 + }, + { + "epoch": 1.3158294126574595, + "grad_norm": 0.8257614374160767, + "learning_rate": 7.810570011136985e-05, + "loss": 1.6954, + "step": 4335 + }, + { + "epoch": 1.3161329488541509, + "grad_norm": 0.8569319844245911, + "learning_rate": 7.810063784549965e-05, + "loss": 1.1517, + "step": 4336 + }, + { + "epoch": 1.3164364850508423, + "grad_norm": 0.5136444568634033, + "learning_rate": 7.809557557962944e-05, + "loss": 1.4734, + "step": 4337 + }, + { + "epoch": 1.3167400212475338, + "grad_norm": 0.4905669093132019, + "learning_rate": 7.809051331375923e-05, + "loss": 1.6479, + "step": 4338 + }, + { + "epoch": 1.3170435574442252, + "grad_norm": 0.5407041311264038, + "learning_rate": 7.808545104788903e-05, + "loss": 1.7631, + "step": 4339 + }, + { + "epoch": 1.3173470936409166, + "grad_norm": 0.5710767507553101, + "learning_rate": 7.808038878201884e-05, + "loss": 1.3063, + "step": 4340 + }, + { + "epoch": 1.317650629837608, + "grad_norm": 0.7482556104660034, + "learning_rate": 7.807532651614863e-05, + "loss": 1.4685, + "step": 4341 + }, + { + "epoch": 1.3179541660342995, + "grad_norm": 0.5573562979698181, + "learning_rate": 7.807026425027843e-05, + "loss": 1.0559, + "step": 4342 + }, + { + "epoch": 1.3182577022309911, + "grad_norm": 0.5664482712745667, + "learning_rate": 7.806520198440822e-05, + "loss": 1.4078, + "step": 4343 + }, + { + "epoch": 1.3185612384276826, + "grad_norm": 0.5078407526016235, + "learning_rate": 7.806013971853802e-05, + "loss": 1.8284, + "step": 4344 + }, + { + "epoch": 1.318864774624374, + "grad_norm": 0.5389295220375061, + "learning_rate": 7.805507745266781e-05, + "loss": 2.0116, + "step": 4345 + }, + { + "epoch": 1.3191683108210654, + "grad_norm": 0.5661474466323853, + "learning_rate": 7.80500151867976e-05, + "loss": 1.4659, + "step": 4346 + }, + { + "epoch": 1.3194718470177569, + "grad_norm": 0.5620774626731873, + "learning_rate": 7.80449529209274e-05, + "loss": 1.8105, + "step": 4347 + }, + { + "epoch": 1.3197753832144483, + "grad_norm": 0.7904017567634583, + "learning_rate": 7.803989065505721e-05, + "loss": 1.7561, + "step": 4348 + }, + { + "epoch": 1.3200789194111398, + "grad_norm": 0.5164950489997864, + "learning_rate": 7.8034828389187e-05, + "loss": 1.7777, + "step": 4349 + }, + { + "epoch": 1.3203824556078312, + "grad_norm": 0.4666251838207245, + "learning_rate": 7.80297661233168e-05, + "loss": 1.8709, + "step": 4350 + }, + { + "epoch": 1.3206859918045226, + "grad_norm": 1.2652373313903809, + "learning_rate": 7.802470385744661e-05, + "loss": 1.7007, + "step": 4351 + }, + { + "epoch": 1.3209895280012143, + "grad_norm": 0.527914822101593, + "learning_rate": 7.80196415915764e-05, + "loss": 1.7681, + "step": 4352 + }, + { + "epoch": 1.3212930641979055, + "grad_norm": 0.49614670872688293, + "learning_rate": 7.80145793257062e-05, + "loss": 1.8928, + "step": 4353 + }, + { + "epoch": 1.3215966003945971, + "grad_norm": 0.5666537284851074, + "learning_rate": 7.800951705983599e-05, + "loss": 1.645, + "step": 4354 + }, + { + "epoch": 1.3219001365912886, + "grad_norm": 0.479524701833725, + "learning_rate": 7.800445479396579e-05, + "loss": 1.6011, + "step": 4355 + }, + { + "epoch": 1.32220367278798, + "grad_norm": 0.6410484313964844, + "learning_rate": 7.799939252809558e-05, + "loss": 1.4003, + "step": 4356 + }, + { + "epoch": 1.3225072089846714, + "grad_norm": 0.5031597018241882, + "learning_rate": 7.799433026222538e-05, + "loss": 1.618, + "step": 4357 + }, + { + "epoch": 1.3228107451813629, + "grad_norm": 0.5946468710899353, + "learning_rate": 7.798926799635517e-05, + "loss": 1.8065, + "step": 4358 + }, + { + "epoch": 1.3231142813780543, + "grad_norm": 0.5310636758804321, + "learning_rate": 7.798420573048497e-05, + "loss": 1.6345, + "step": 4359 + }, + { + "epoch": 1.3234178175747457, + "grad_norm": 0.4639948904514313, + "learning_rate": 7.797914346461477e-05, + "loss": 1.4134, + "step": 4360 + }, + { + "epoch": 1.3237213537714372, + "grad_norm": 0.5224514007568359, + "learning_rate": 7.797408119874457e-05, + "loss": 1.6937, + "step": 4361 + }, + { + "epoch": 1.3240248899681286, + "grad_norm": 0.5320509672164917, + "learning_rate": 7.796901893287436e-05, + "loss": 1.6988, + "step": 4362 + }, + { + "epoch": 1.3243284261648203, + "grad_norm": 0.46581387519836426, + "learning_rate": 7.796395666700416e-05, + "loss": 1.2369, + "step": 4363 + }, + { + "epoch": 1.3246319623615115, + "grad_norm": 0.5986335277557373, + "learning_rate": 7.795889440113395e-05, + "loss": 1.3976, + "step": 4364 + }, + { + "epoch": 1.3249354985582031, + "grad_norm": 0.46746957302093506, + "learning_rate": 7.795383213526375e-05, + "loss": 1.8816, + "step": 4365 + }, + { + "epoch": 1.3252390347548946, + "grad_norm": 0.505970299243927, + "learning_rate": 7.794876986939354e-05, + "loss": 1.825, + "step": 4366 + }, + { + "epoch": 1.325542570951586, + "grad_norm": 0.4795224666595459, + "learning_rate": 7.794370760352334e-05, + "loss": 1.9911, + "step": 4367 + }, + { + "epoch": 1.3258461071482774, + "grad_norm": 0.5055125951766968, + "learning_rate": 7.793864533765313e-05, + "loss": 0.8959, + "step": 4368 + }, + { + "epoch": 1.3261496433449689, + "grad_norm": 0.5013410449028015, + "learning_rate": 7.793358307178293e-05, + "loss": 1.7976, + "step": 4369 + }, + { + "epoch": 1.3264531795416603, + "grad_norm": 0.6693021655082703, + "learning_rate": 7.792852080591274e-05, + "loss": 1.3238, + "step": 4370 + }, + { + "epoch": 1.3267567157383517, + "grad_norm": 0.5832652449607849, + "learning_rate": 7.792345854004253e-05, + "loss": 1.1272, + "step": 4371 + }, + { + "epoch": 1.3270602519350432, + "grad_norm": 0.6789683103561401, + "learning_rate": 7.791839627417233e-05, + "loss": 1.6659, + "step": 4372 + }, + { + "epoch": 1.3273637881317346, + "grad_norm": 0.5950153470039368, + "learning_rate": 7.791333400830212e-05, + "loss": 1.3312, + "step": 4373 + }, + { + "epoch": 1.3276673243284263, + "grad_norm": 0.508441686630249, + "learning_rate": 7.790827174243191e-05, + "loss": 2.0006, + "step": 4374 + }, + { + "epoch": 1.3279708605251177, + "grad_norm": 0.5696821212768555, + "learning_rate": 7.790320947656171e-05, + "loss": 1.5527, + "step": 4375 + }, + { + "epoch": 1.3282743967218091, + "grad_norm": 0.5755532383918762, + "learning_rate": 7.78981472106915e-05, + "loss": 1.6013, + "step": 4376 + }, + { + "epoch": 1.3285779329185006, + "grad_norm": 0.573461651802063, + "learning_rate": 7.78930849448213e-05, + "loss": 1.5225, + "step": 4377 + }, + { + "epoch": 1.328881469115192, + "grad_norm": 0.5905348062515259, + "learning_rate": 7.78880226789511e-05, + "loss": 1.6171, + "step": 4378 + }, + { + "epoch": 1.3291850053118834, + "grad_norm": 0.5171772241592407, + "learning_rate": 7.78829604130809e-05, + "loss": 1.403, + "step": 4379 + }, + { + "epoch": 1.3294885415085749, + "grad_norm": 0.596920907497406, + "learning_rate": 7.78778981472107e-05, + "loss": 1.8381, + "step": 4380 + }, + { + "epoch": 1.3297920777052663, + "grad_norm": 0.5608156323432922, + "learning_rate": 7.787283588134049e-05, + "loss": 1.7, + "step": 4381 + }, + { + "epoch": 1.3300956139019577, + "grad_norm": 0.5419397950172424, + "learning_rate": 7.786777361547029e-05, + "loss": 1.952, + "step": 4382 + }, + { + "epoch": 1.3303991500986494, + "grad_norm": 0.5569834113121033, + "learning_rate": 7.786271134960008e-05, + "loss": 1.3201, + "step": 4383 + }, + { + "epoch": 1.3307026862953406, + "grad_norm": 0.5397844910621643, + "learning_rate": 7.785764908372988e-05, + "loss": 1.6868, + "step": 4384 + }, + { + "epoch": 1.3310062224920323, + "grad_norm": 0.5021274089813232, + "learning_rate": 7.785258681785967e-05, + "loss": 1.2401, + "step": 4385 + }, + { + "epoch": 1.3313097586887237, + "grad_norm": 0.5976943373680115, + "learning_rate": 7.784752455198947e-05, + "loss": 1.6123, + "step": 4386 + }, + { + "epoch": 1.3316132948854151, + "grad_norm": 0.5162238478660583, + "learning_rate": 7.784246228611926e-05, + "loss": 1.8259, + "step": 4387 + }, + { + "epoch": 1.3319168310821066, + "grad_norm": 0.5028786659240723, + "learning_rate": 7.783740002024907e-05, + "loss": 1.08, + "step": 4388 + }, + { + "epoch": 1.332220367278798, + "grad_norm": 0.4934920072555542, + "learning_rate": 7.783233775437886e-05, + "loss": 1.7243, + "step": 4389 + }, + { + "epoch": 1.3325239034754894, + "grad_norm": 0.40703603625297546, + "learning_rate": 7.782727548850866e-05, + "loss": 1.4138, + "step": 4390 + }, + { + "epoch": 1.3328274396721809, + "grad_norm": 0.5625724792480469, + "learning_rate": 7.782221322263845e-05, + "loss": 1.775, + "step": 4391 + }, + { + "epoch": 1.3331309758688723, + "grad_norm": 0.5374658703804016, + "learning_rate": 7.781715095676825e-05, + "loss": 1.4939, + "step": 4392 + }, + { + "epoch": 1.3334345120655637, + "grad_norm": 0.5174360275268555, + "learning_rate": 7.781208869089806e-05, + "loss": 1.6717, + "step": 4393 + }, + { + "epoch": 1.3337380482622554, + "grad_norm": 0.42424100637435913, + "learning_rate": 7.780702642502785e-05, + "loss": 1.1542, + "step": 4394 + }, + { + "epoch": 1.3340415844589466, + "grad_norm": 0.5028985142707825, + "learning_rate": 7.780196415915765e-05, + "loss": 1.8072, + "step": 4395 + }, + { + "epoch": 1.3343451206556383, + "grad_norm": 0.5722070336341858, + "learning_rate": 7.779690189328744e-05, + "loss": 1.7323, + "step": 4396 + }, + { + "epoch": 1.3346486568523297, + "grad_norm": 0.5366278290748596, + "learning_rate": 7.779183962741724e-05, + "loss": 1.7701, + "step": 4397 + }, + { + "epoch": 1.3349521930490211, + "grad_norm": 0.38516730070114136, + "learning_rate": 7.778677736154703e-05, + "loss": 1.4055, + "step": 4398 + }, + { + "epoch": 1.3352557292457126, + "grad_norm": 0.4074214696884155, + "learning_rate": 7.778171509567684e-05, + "loss": 1.8972, + "step": 4399 + }, + { + "epoch": 1.335559265442404, + "grad_norm": 0.519972562789917, + "learning_rate": 7.777665282980663e-05, + "loss": 1.9597, + "step": 4400 + }, + { + "epoch": 1.3358628016390954, + "grad_norm": 0.46331217885017395, + "learning_rate": 7.777159056393643e-05, + "loss": 1.9177, + "step": 4401 + }, + { + "epoch": 1.3361663378357869, + "grad_norm": 0.5337703824043274, + "learning_rate": 7.776652829806622e-05, + "loss": 1.4516, + "step": 4402 + }, + { + "epoch": 1.3364698740324783, + "grad_norm": 0.5979052186012268, + "learning_rate": 7.776146603219602e-05, + "loss": 1.78, + "step": 4403 + }, + { + "epoch": 1.3367734102291697, + "grad_norm": 0.5446959733963013, + "learning_rate": 7.775640376632581e-05, + "loss": 1.7176, + "step": 4404 + }, + { + "epoch": 1.3370769464258614, + "grad_norm": 0.45639851689338684, + "learning_rate": 7.775134150045561e-05, + "loss": 1.1853, + "step": 4405 + }, + { + "epoch": 1.3373804826225526, + "grad_norm": 0.5213142037391663, + "learning_rate": 7.77462792345854e-05, + "loss": 1.7366, + "step": 4406 + }, + { + "epoch": 1.3376840188192443, + "grad_norm": 0.5132396221160889, + "learning_rate": 7.77412169687152e-05, + "loss": 1.5986, + "step": 4407 + }, + { + "epoch": 1.3379875550159357, + "grad_norm": 0.5052165389060974, + "learning_rate": 7.773615470284499e-05, + "loss": 1.7698, + "step": 4408 + }, + { + "epoch": 1.3382910912126271, + "grad_norm": 0.5907325148582458, + "learning_rate": 7.77310924369748e-05, + "loss": 1.6297, + "step": 4409 + }, + { + "epoch": 1.3385946274093186, + "grad_norm": 0.5656864643096924, + "learning_rate": 7.77260301711046e-05, + "loss": 1.6696, + "step": 4410 + }, + { + "epoch": 1.33889816360601, + "grad_norm": 0.6518059968948364, + "learning_rate": 7.772096790523439e-05, + "loss": 1.3919, + "step": 4411 + }, + { + "epoch": 1.3392016998027014, + "grad_norm": 0.5692440271377563, + "learning_rate": 7.771590563936418e-05, + "loss": 1.5376, + "step": 4412 + }, + { + "epoch": 1.3395052359993929, + "grad_norm": 0.5003619194030762, + "learning_rate": 7.771084337349398e-05, + "loss": 1.5989, + "step": 4413 + }, + { + "epoch": 1.3398087721960845, + "grad_norm": 0.5197275280952454, + "learning_rate": 7.770578110762377e-05, + "loss": 1.7232, + "step": 4414 + }, + { + "epoch": 1.3401123083927757, + "grad_norm": 0.498151570558548, + "learning_rate": 7.770071884175357e-05, + "loss": 1.8019, + "step": 4415 + }, + { + "epoch": 1.3404158445894674, + "grad_norm": 0.5612940788269043, + "learning_rate": 7.769565657588336e-05, + "loss": 2.1291, + "step": 4416 + }, + { + "epoch": 1.3407193807861588, + "grad_norm": 0.6868269443511963, + "learning_rate": 7.769059431001316e-05, + "loss": 1.6882, + "step": 4417 + }, + { + "epoch": 1.3410229169828503, + "grad_norm": 0.48497381806373596, + "learning_rate": 7.768553204414297e-05, + "loss": 1.6664, + "step": 4418 + }, + { + "epoch": 1.3413264531795417, + "grad_norm": 0.5606340169906616, + "learning_rate": 7.768046977827276e-05, + "loss": 1.8263, + "step": 4419 + }, + { + "epoch": 1.3416299893762331, + "grad_norm": 0.4336944818496704, + "learning_rate": 7.767540751240256e-05, + "loss": 1.8313, + "step": 4420 + }, + { + "epoch": 1.3419335255729246, + "grad_norm": 0.8053896427154541, + "learning_rate": 7.767034524653235e-05, + "loss": 1.5858, + "step": 4421 + }, + { + "epoch": 1.342237061769616, + "grad_norm": 0.4444625973701477, + "learning_rate": 7.766528298066215e-05, + "loss": 1.5288, + "step": 4422 + }, + { + "epoch": 1.3425405979663074, + "grad_norm": 0.5923985242843628, + "learning_rate": 7.766022071479194e-05, + "loss": 1.7149, + "step": 4423 + }, + { + "epoch": 1.3428441341629989, + "grad_norm": 0.5716237425804138, + "learning_rate": 7.765515844892174e-05, + "loss": 1.8926, + "step": 4424 + }, + { + "epoch": 1.3431476703596905, + "grad_norm": 0.5298818349838257, + "learning_rate": 7.765009618305153e-05, + "loss": 1.616, + "step": 4425 + }, + { + "epoch": 1.3434512065563817, + "grad_norm": 0.5272706151008606, + "learning_rate": 7.764503391718133e-05, + "loss": 2.0209, + "step": 4426 + }, + { + "epoch": 1.3437547427530734, + "grad_norm": 0.6190195083618164, + "learning_rate": 7.763997165131113e-05, + "loss": 1.4569, + "step": 4427 + }, + { + "epoch": 1.3440582789497648, + "grad_norm": 0.6583245992660522, + "learning_rate": 7.763490938544093e-05, + "loss": 1.5768, + "step": 4428 + }, + { + "epoch": 1.3443618151464563, + "grad_norm": 0.5891104936599731, + "learning_rate": 7.762984711957072e-05, + "loss": 1.7829, + "step": 4429 + }, + { + "epoch": 1.3446653513431477, + "grad_norm": 0.47343286871910095, + "learning_rate": 7.762478485370052e-05, + "loss": 1.6713, + "step": 4430 + }, + { + "epoch": 1.3449688875398391, + "grad_norm": 0.4860784113407135, + "learning_rate": 7.761972258783031e-05, + "loss": 1.4912, + "step": 4431 + }, + { + "epoch": 1.3452724237365306, + "grad_norm": 0.5806290507316589, + "learning_rate": 7.761466032196011e-05, + "loss": 1.9675, + "step": 4432 + }, + { + "epoch": 1.345575959933222, + "grad_norm": 0.7708868384361267, + "learning_rate": 7.76095980560899e-05, + "loss": 1.8951, + "step": 4433 + }, + { + "epoch": 1.3458794961299134, + "grad_norm": 0.4970077574253082, + "learning_rate": 7.76045357902197e-05, + "loss": 1.706, + "step": 4434 + }, + { + "epoch": 1.3461830323266049, + "grad_norm": 0.5021582841873169, + "learning_rate": 7.759947352434949e-05, + "loss": 1.693, + "step": 4435 + }, + { + "epoch": 1.3464865685232965, + "grad_norm": 0.511876106262207, + "learning_rate": 7.759441125847929e-05, + "loss": 1.5194, + "step": 4436 + }, + { + "epoch": 1.3467901047199877, + "grad_norm": 0.5340248942375183, + "learning_rate": 7.75893489926091e-05, + "loss": 1.5881, + "step": 4437 + }, + { + "epoch": 1.3470936409166794, + "grad_norm": 0.4671190083026886, + "learning_rate": 7.75842867267389e-05, + "loss": 1.6339, + "step": 4438 + }, + { + "epoch": 1.3473971771133708, + "grad_norm": 0.500554084777832, + "learning_rate": 7.75792244608687e-05, + "loss": 1.7419, + "step": 4439 + }, + { + "epoch": 1.3477007133100622, + "grad_norm": 0.5325165390968323, + "learning_rate": 7.75741621949985e-05, + "loss": 1.4884, + "step": 4440 + }, + { + "epoch": 1.3480042495067537, + "grad_norm": 0.5758363604545593, + "learning_rate": 7.756909992912829e-05, + "loss": 1.7129, + "step": 4441 + }, + { + "epoch": 1.3483077857034451, + "grad_norm": 0.44963330030441284, + "learning_rate": 7.756403766325808e-05, + "loss": 1.7194, + "step": 4442 + }, + { + "epoch": 1.3486113219001366, + "grad_norm": 0.49702900648117065, + "learning_rate": 7.755897539738788e-05, + "loss": 1.3038, + "step": 4443 + }, + { + "epoch": 1.348914858096828, + "grad_norm": 0.5857210755348206, + "learning_rate": 7.755391313151767e-05, + "loss": 1.282, + "step": 4444 + }, + { + "epoch": 1.3492183942935196, + "grad_norm": 0.47209036350250244, + "learning_rate": 7.754885086564747e-05, + "loss": 1.7731, + "step": 4445 + }, + { + "epoch": 1.3495219304902109, + "grad_norm": 0.6250527501106262, + "learning_rate": 7.754378859977726e-05, + "loss": 1.6755, + "step": 4446 + }, + { + "epoch": 1.3498254666869025, + "grad_norm": 0.5777345299720764, + "learning_rate": 7.753872633390706e-05, + "loss": 1.587, + "step": 4447 + }, + { + "epoch": 1.350129002883594, + "grad_norm": 0.5663403868675232, + "learning_rate": 7.753366406803687e-05, + "loss": 1.8542, + "step": 4448 + }, + { + "epoch": 1.3504325390802854, + "grad_norm": 0.4112309515476227, + "learning_rate": 7.752860180216666e-05, + "loss": 0.5226, + "step": 4449 + }, + { + "epoch": 1.3507360752769768, + "grad_norm": 0.5185359716415405, + "learning_rate": 7.752353953629645e-05, + "loss": 1.7451, + "step": 4450 + }, + { + "epoch": 1.3510396114736682, + "grad_norm": 0.4213547706604004, + "learning_rate": 7.751847727042625e-05, + "loss": 1.8753, + "step": 4451 + }, + { + "epoch": 1.3513431476703597, + "grad_norm": 0.563086211681366, + "learning_rate": 7.751341500455604e-05, + "loss": 1.6842, + "step": 4452 + }, + { + "epoch": 1.3516466838670511, + "grad_norm": 0.4909687042236328, + "learning_rate": 7.750835273868584e-05, + "loss": 1.252, + "step": 4453 + }, + { + "epoch": 1.3519502200637425, + "grad_norm": 0.3422612249851227, + "learning_rate": 7.750329047281563e-05, + "loss": 1.4584, + "step": 4454 + }, + { + "epoch": 1.352253756260434, + "grad_norm": 0.5087230205535889, + "learning_rate": 7.749822820694543e-05, + "loss": 1.7198, + "step": 4455 + }, + { + "epoch": 1.3525572924571256, + "grad_norm": 0.5094039440155029, + "learning_rate": 7.749316594107522e-05, + "loss": 1.8156, + "step": 4456 + }, + { + "epoch": 1.3528608286538168, + "grad_norm": 0.5707986354827881, + "learning_rate": 7.748810367520503e-05, + "loss": 1.4866, + "step": 4457 + }, + { + "epoch": 1.3531643648505085, + "grad_norm": 0.5794532895088196, + "learning_rate": 7.748304140933483e-05, + "loss": 1.7428, + "step": 4458 + }, + { + "epoch": 1.3534679010472, + "grad_norm": 0.38065409660339355, + "learning_rate": 7.747797914346462e-05, + "loss": 1.2621, + "step": 4459 + }, + { + "epoch": 1.3537714372438914, + "grad_norm": 0.5278635025024414, + "learning_rate": 7.747291687759442e-05, + "loss": 1.7026, + "step": 4460 + }, + { + "epoch": 1.3540749734405828, + "grad_norm": 0.584745466709137, + "learning_rate": 7.746785461172421e-05, + "loss": 1.9313, + "step": 4461 + }, + { + "epoch": 1.3543785096372742, + "grad_norm": 0.5232474207878113, + "learning_rate": 7.7462792345854e-05, + "loss": 1.8206, + "step": 4462 + }, + { + "epoch": 1.3546820458339657, + "grad_norm": 0.647459089756012, + "learning_rate": 7.74577300799838e-05, + "loss": 1.3045, + "step": 4463 + }, + { + "epoch": 1.354985582030657, + "grad_norm": 0.5332021117210388, + "learning_rate": 7.74526678141136e-05, + "loss": 1.7948, + "step": 4464 + }, + { + "epoch": 1.3552891182273485, + "grad_norm": 0.5136265754699707, + "learning_rate": 7.744760554824339e-05, + "loss": 1.8639, + "step": 4465 + }, + { + "epoch": 1.35559265442404, + "grad_norm": 0.5178390741348267, + "learning_rate": 7.74425432823732e-05, + "loss": 1.3601, + "step": 4466 + }, + { + "epoch": 1.3558961906207316, + "grad_norm": 0.4487190246582031, + "learning_rate": 7.7437481016503e-05, + "loss": 1.7373, + "step": 4467 + }, + { + "epoch": 1.3561997268174228, + "grad_norm": 0.6040553450584412, + "learning_rate": 7.743241875063279e-05, + "loss": 1.1153, + "step": 4468 + }, + { + "epoch": 1.3565032630141145, + "grad_norm": 0.533661961555481, + "learning_rate": 7.742735648476258e-05, + "loss": 1.6861, + "step": 4469 + }, + { + "epoch": 1.356806799210806, + "grad_norm": 0.5596243739128113, + "learning_rate": 7.742229421889238e-05, + "loss": 1.6324, + "step": 4470 + }, + { + "epoch": 1.3571103354074974, + "grad_norm": 0.5296964645385742, + "learning_rate": 7.741723195302217e-05, + "loss": 1.7621, + "step": 4471 + }, + { + "epoch": 1.3574138716041888, + "grad_norm": 0.4589049518108368, + "learning_rate": 7.741216968715197e-05, + "loss": 1.7543, + "step": 4472 + }, + { + "epoch": 1.3577174078008802, + "grad_norm": 0.5302414298057556, + "learning_rate": 7.740710742128176e-05, + "loss": 1.5546, + "step": 4473 + }, + { + "epoch": 1.3580209439975717, + "grad_norm": 0.6762099862098694, + "learning_rate": 7.740204515541156e-05, + "loss": 1.3986, + "step": 4474 + }, + { + "epoch": 1.358324480194263, + "grad_norm": 0.4986850321292877, + "learning_rate": 7.739698288954135e-05, + "loss": 1.9552, + "step": 4475 + }, + { + "epoch": 1.3586280163909545, + "grad_norm": 0.602545976638794, + "learning_rate": 7.739192062367116e-05, + "loss": 1.1948, + "step": 4476 + }, + { + "epoch": 1.358931552587646, + "grad_norm": 0.5418416857719421, + "learning_rate": 7.738685835780095e-05, + "loss": 1.8975, + "step": 4477 + }, + { + "epoch": 1.3592350887843376, + "grad_norm": 0.5278483033180237, + "learning_rate": 7.738179609193075e-05, + "loss": 1.8907, + "step": 4478 + }, + { + "epoch": 1.359538624981029, + "grad_norm": 0.5483014583587646, + "learning_rate": 7.737673382606054e-05, + "loss": 1.6767, + "step": 4479 + }, + { + "epoch": 1.3598421611777205, + "grad_norm": 0.5430691838264465, + "learning_rate": 7.737167156019034e-05, + "loss": 1.7736, + "step": 4480 + }, + { + "epoch": 1.360145697374412, + "grad_norm": 0.5248533487319946, + "learning_rate": 7.736660929432013e-05, + "loss": 1.6338, + "step": 4481 + }, + { + "epoch": 1.3604492335711034, + "grad_norm": 0.49445077776908875, + "learning_rate": 7.736154702844994e-05, + "loss": 1.7626, + "step": 4482 + }, + { + "epoch": 1.3607527697677948, + "grad_norm": 0.91696697473526, + "learning_rate": 7.735648476257974e-05, + "loss": 1.5865, + "step": 4483 + }, + { + "epoch": 1.3610563059644862, + "grad_norm": 0.5618857145309448, + "learning_rate": 7.735142249670953e-05, + "loss": 1.6746, + "step": 4484 + }, + { + "epoch": 1.3613598421611777, + "grad_norm": 0.497670441865921, + "learning_rate": 7.734636023083933e-05, + "loss": 1.7111, + "step": 4485 + }, + { + "epoch": 1.361663378357869, + "grad_norm": 0.5272151827812195, + "learning_rate": 7.734129796496912e-05, + "loss": 1.7878, + "step": 4486 + }, + { + "epoch": 1.3619669145545608, + "grad_norm": 0.5168973803520203, + "learning_rate": 7.733623569909893e-05, + "loss": 1.8668, + "step": 4487 + }, + { + "epoch": 1.362270450751252, + "grad_norm": 0.7518470883369446, + "learning_rate": 7.733117343322872e-05, + "loss": 1.9678, + "step": 4488 + }, + { + "epoch": 1.3625739869479436, + "grad_norm": 0.5416634678840637, + "learning_rate": 7.732611116735852e-05, + "loss": 1.8942, + "step": 4489 + }, + { + "epoch": 1.362877523144635, + "grad_norm": 0.5445654988288879, + "learning_rate": 7.732104890148831e-05, + "loss": 1.1588, + "step": 4490 + }, + { + "epoch": 1.3631810593413265, + "grad_norm": 0.4440891146659851, + "learning_rate": 7.731598663561811e-05, + "loss": 1.0499, + "step": 4491 + }, + { + "epoch": 1.363484595538018, + "grad_norm": 0.4910653233528137, + "learning_rate": 7.73109243697479e-05, + "loss": 1.5553, + "step": 4492 + }, + { + "epoch": 1.3637881317347094, + "grad_norm": 0.438759446144104, + "learning_rate": 7.73058621038777e-05, + "loss": 1.7959, + "step": 4493 + }, + { + "epoch": 1.3640916679314008, + "grad_norm": 0.527281641960144, + "learning_rate": 7.73007998380075e-05, + "loss": 1.9108, + "step": 4494 + }, + { + "epoch": 1.3643952041280922, + "grad_norm": 0.5466772317886353, + "learning_rate": 7.729573757213729e-05, + "loss": 1.3945, + "step": 4495 + }, + { + "epoch": 1.3646987403247837, + "grad_norm": 0.47780725359916687, + "learning_rate": 7.72906753062671e-05, + "loss": 1.685, + "step": 4496 + }, + { + "epoch": 1.365002276521475, + "grad_norm": 0.5176924467086792, + "learning_rate": 7.728561304039689e-05, + "loss": 1.7529, + "step": 4497 + }, + { + "epoch": 1.3653058127181668, + "grad_norm": 0.572093665599823, + "learning_rate": 7.728055077452669e-05, + "loss": 1.8185, + "step": 4498 + }, + { + "epoch": 1.365609348914858, + "grad_norm": 0.6627562046051025, + "learning_rate": 7.727548850865648e-05, + "loss": 1.6821, + "step": 4499 + }, + { + "epoch": 1.3659128851115496, + "grad_norm": 0.4149744212627411, + "learning_rate": 7.727042624278628e-05, + "loss": 1.8283, + "step": 4500 + }, + { + "epoch": 1.366216421308241, + "grad_norm": 0.590709388256073, + "learning_rate": 7.726536397691607e-05, + "loss": 1.6729, + "step": 4501 + }, + { + "epoch": 1.3665199575049325, + "grad_norm": 0.5867372751235962, + "learning_rate": 7.726030171104587e-05, + "loss": 1.0463, + "step": 4502 + }, + { + "epoch": 1.366823493701624, + "grad_norm": 0.4657502770423889, + "learning_rate": 7.725523944517566e-05, + "loss": 1.6178, + "step": 4503 + }, + { + "epoch": 1.3671270298983154, + "grad_norm": 0.43361958861351013, + "learning_rate": 7.725017717930546e-05, + "loss": 1.7674, + "step": 4504 + }, + { + "epoch": 1.3674305660950068, + "grad_norm": 0.44244110584259033, + "learning_rate": 7.724511491343526e-05, + "loss": 1.7252, + "step": 4505 + }, + { + "epoch": 1.3677341022916982, + "grad_norm": 0.4858807921409607, + "learning_rate": 7.724005264756506e-05, + "loss": 1.698, + "step": 4506 + }, + { + "epoch": 1.3680376384883897, + "grad_norm": 0.46887320280075073, + "learning_rate": 7.723499038169485e-05, + "loss": 1.8538, + "step": 4507 + }, + { + "epoch": 1.368341174685081, + "grad_norm": 0.6876465678215027, + "learning_rate": 7.722992811582465e-05, + "loss": 1.3666, + "step": 4508 + }, + { + "epoch": 1.3686447108817728, + "grad_norm": 0.6491440534591675, + "learning_rate": 7.722486584995444e-05, + "loss": 1.4743, + "step": 4509 + }, + { + "epoch": 1.3689482470784642, + "grad_norm": 0.6107078790664673, + "learning_rate": 7.721980358408424e-05, + "loss": 1.8493, + "step": 4510 + }, + { + "epoch": 1.3692517832751556, + "grad_norm": 0.9721749424934387, + "learning_rate": 7.721474131821403e-05, + "loss": 1.2488, + "step": 4511 + }, + { + "epoch": 1.369555319471847, + "grad_norm": 0.4856562614440918, + "learning_rate": 7.720967905234383e-05, + "loss": 1.6012, + "step": 4512 + }, + { + "epoch": 1.3698588556685385, + "grad_norm": 0.5122790336608887, + "learning_rate": 7.720461678647362e-05, + "loss": 1.9067, + "step": 4513 + }, + { + "epoch": 1.37016239186523, + "grad_norm": 0.45854032039642334, + "learning_rate": 7.719955452060342e-05, + "loss": 1.6468, + "step": 4514 + }, + { + "epoch": 1.3704659280619214, + "grad_norm": 0.5521238446235657, + "learning_rate": 7.719449225473322e-05, + "loss": 1.6377, + "step": 4515 + }, + { + "epoch": 1.3707694642586128, + "grad_norm": 0.4813542068004608, + "learning_rate": 7.718942998886302e-05, + "loss": 1.704, + "step": 4516 + }, + { + "epoch": 1.3710730004553042, + "grad_norm": 0.604897141456604, + "learning_rate": 7.718436772299281e-05, + "loss": 1.7302, + "step": 4517 + }, + { + "epoch": 1.3713765366519959, + "grad_norm": 0.5965009927749634, + "learning_rate": 7.717930545712261e-05, + "loss": 1.7354, + "step": 4518 + }, + { + "epoch": 1.371680072848687, + "grad_norm": 0.4838474988937378, + "learning_rate": 7.71742431912524e-05, + "loss": 1.2562, + "step": 4519 + }, + { + "epoch": 1.3719836090453787, + "grad_norm": 0.5375049114227295, + "learning_rate": 7.71691809253822e-05, + "loss": 1.8191, + "step": 4520 + }, + { + "epoch": 1.3722871452420702, + "grad_norm": 0.47960102558135986, + "learning_rate": 7.7164118659512e-05, + "loss": 1.8247, + "step": 4521 + }, + { + "epoch": 1.3725906814387616, + "grad_norm": 0.42799845337867737, + "learning_rate": 7.715905639364179e-05, + "loss": 1.6597, + "step": 4522 + }, + { + "epoch": 1.372894217635453, + "grad_norm": 1.6170735359191895, + "learning_rate": 7.715399412777158e-05, + "loss": 1.4322, + "step": 4523 + }, + { + "epoch": 1.3731977538321445, + "grad_norm": 0.5388947129249573, + "learning_rate": 7.714893186190139e-05, + "loss": 1.2625, + "step": 4524 + }, + { + "epoch": 1.373501290028836, + "grad_norm": 1.5445969104766846, + "learning_rate": 7.714386959603119e-05, + "loss": 1.091, + "step": 4525 + }, + { + "epoch": 1.3738048262255274, + "grad_norm": 0.5684570074081421, + "learning_rate": 7.7138807330161e-05, + "loss": 1.4048, + "step": 4526 + }, + { + "epoch": 1.3741083624222188, + "grad_norm": 0.565920889377594, + "learning_rate": 7.713374506429079e-05, + "loss": 1.5052, + "step": 4527 + }, + { + "epoch": 1.3744118986189102, + "grad_norm": 0.5732594728469849, + "learning_rate": 7.712868279842058e-05, + "loss": 1.3838, + "step": 4528 + }, + { + "epoch": 1.3747154348156019, + "grad_norm": 0.6106382608413696, + "learning_rate": 7.712362053255038e-05, + "loss": 1.2531, + "step": 4529 + }, + { + "epoch": 1.375018971012293, + "grad_norm": 0.6688647270202637, + "learning_rate": 7.711855826668017e-05, + "loss": 1.7071, + "step": 4530 + }, + { + "epoch": 1.3753225072089847, + "grad_norm": 0.534449577331543, + "learning_rate": 7.711349600080997e-05, + "loss": 1.7455, + "step": 4531 + }, + { + "epoch": 1.3756260434056762, + "grad_norm": 0.5120007991790771, + "learning_rate": 7.710843373493976e-05, + "loss": 1.8533, + "step": 4532 + }, + { + "epoch": 1.3759295796023676, + "grad_norm": 0.5665999054908752, + "learning_rate": 7.710337146906956e-05, + "loss": 1.8525, + "step": 4533 + }, + { + "epoch": 1.376233115799059, + "grad_norm": 0.5839645266532898, + "learning_rate": 7.709830920319935e-05, + "loss": 1.5777, + "step": 4534 + }, + { + "epoch": 1.3765366519957505, + "grad_norm": 0.5277237296104431, + "learning_rate": 7.709324693732916e-05, + "loss": 1.8254, + "step": 4535 + }, + { + "epoch": 1.376840188192442, + "grad_norm": 0.53985196352005, + "learning_rate": 7.708818467145896e-05, + "loss": 1.3575, + "step": 4536 + }, + { + "epoch": 1.3771437243891334, + "grad_norm": 0.5370942950248718, + "learning_rate": 7.708312240558875e-05, + "loss": 1.6435, + "step": 4537 + }, + { + "epoch": 1.3774472605858248, + "grad_norm": 0.4175781011581421, + "learning_rate": 7.707806013971855e-05, + "loss": 1.4873, + "step": 4538 + }, + { + "epoch": 1.3777507967825162, + "grad_norm": 0.5643919110298157, + "learning_rate": 7.707299787384834e-05, + "loss": 1.8618, + "step": 4539 + }, + { + "epoch": 1.3780543329792079, + "grad_norm": 0.5539401769638062, + "learning_rate": 7.706793560797814e-05, + "loss": 1.837, + "step": 4540 + }, + { + "epoch": 1.3783578691758993, + "grad_norm": 0.4684971570968628, + "learning_rate": 7.706287334210793e-05, + "loss": 1.7192, + "step": 4541 + }, + { + "epoch": 1.3786614053725907, + "grad_norm": 0.5881449580192566, + "learning_rate": 7.705781107623773e-05, + "loss": 0.9102, + "step": 4542 + }, + { + "epoch": 1.3789649415692822, + "grad_norm": 0.515137255191803, + "learning_rate": 7.705274881036752e-05, + "loss": 1.3175, + "step": 4543 + }, + { + "epoch": 1.3792684777659736, + "grad_norm": 0.5105268955230713, + "learning_rate": 7.704768654449733e-05, + "loss": 1.7881, + "step": 4544 + }, + { + "epoch": 1.379572013962665, + "grad_norm": 0.5810350179672241, + "learning_rate": 7.704262427862712e-05, + "loss": 1.1397, + "step": 4545 + }, + { + "epoch": 1.3798755501593565, + "grad_norm": 0.5091977715492249, + "learning_rate": 7.703756201275692e-05, + "loss": 2.0762, + "step": 4546 + }, + { + "epoch": 1.380179086356048, + "grad_norm": 0.5842112302780151, + "learning_rate": 7.703249974688671e-05, + "loss": 1.8471, + "step": 4547 + }, + { + "epoch": 1.3804826225527393, + "grad_norm": 0.890228807926178, + "learning_rate": 7.702743748101651e-05, + "loss": 1.3657, + "step": 4548 + }, + { + "epoch": 1.380786158749431, + "grad_norm": 0.6264392733573914, + "learning_rate": 7.70223752151463e-05, + "loss": 1.7772, + "step": 4549 + }, + { + "epoch": 1.3810896949461222, + "grad_norm": 0.5175667405128479, + "learning_rate": 7.70173129492761e-05, + "loss": 1.598, + "step": 4550 + }, + { + "epoch": 1.3813932311428139, + "grad_norm": 0.5696103572845459, + "learning_rate": 7.701225068340589e-05, + "loss": 1.8323, + "step": 4551 + }, + { + "epoch": 1.3816967673395053, + "grad_norm": 0.6493420600891113, + "learning_rate": 7.700718841753569e-05, + "loss": 1.1722, + "step": 4552 + }, + { + "epoch": 1.3820003035361967, + "grad_norm": 0.5873674154281616, + "learning_rate": 7.700212615166548e-05, + "loss": 1.7187, + "step": 4553 + }, + { + "epoch": 1.3823038397328882, + "grad_norm": 0.5197107791900635, + "learning_rate": 7.699706388579529e-05, + "loss": 1.3359, + "step": 4554 + }, + { + "epoch": 1.3826073759295796, + "grad_norm": 0.7124143838882446, + "learning_rate": 7.699200161992508e-05, + "loss": 1.4417, + "step": 4555 + }, + { + "epoch": 1.382910912126271, + "grad_norm": 0.49033257365226746, + "learning_rate": 7.698693935405488e-05, + "loss": 1.7348, + "step": 4556 + }, + { + "epoch": 1.3832144483229625, + "grad_norm": 0.5684939622879028, + "learning_rate": 7.698187708818467e-05, + "loss": 1.4596, + "step": 4557 + }, + { + "epoch": 1.383517984519654, + "grad_norm": 0.5147912502288818, + "learning_rate": 7.697681482231447e-05, + "loss": 2.0594, + "step": 4558 + }, + { + "epoch": 1.3838215207163453, + "grad_norm": 0.5846849679946899, + "learning_rate": 7.697175255644426e-05, + "loss": 1.6377, + "step": 4559 + }, + { + "epoch": 1.384125056913037, + "grad_norm": 0.4074210822582245, + "learning_rate": 7.696669029057406e-05, + "loss": 0.914, + "step": 4560 + }, + { + "epoch": 1.3844285931097282, + "grad_norm": 0.5171768069267273, + "learning_rate": 7.696162802470385e-05, + "loss": 1.5171, + "step": 4561 + }, + { + "epoch": 1.3847321293064199, + "grad_norm": 0.4810327887535095, + "learning_rate": 7.695656575883365e-05, + "loss": 1.8348, + "step": 4562 + }, + { + "epoch": 1.3850356655031113, + "grad_norm": 0.49266764521598816, + "learning_rate": 7.695150349296346e-05, + "loss": 1.8055, + "step": 4563 + }, + { + "epoch": 1.3853392016998027, + "grad_norm": 0.7078859210014343, + "learning_rate": 7.694644122709325e-05, + "loss": 2.0686, + "step": 4564 + }, + { + "epoch": 1.3856427378964942, + "grad_norm": 0.5220522880554199, + "learning_rate": 7.694137896122305e-05, + "loss": 1.7709, + "step": 4565 + }, + { + "epoch": 1.3859462740931856, + "grad_norm": 0.5483274459838867, + "learning_rate": 7.693631669535284e-05, + "loss": 1.3725, + "step": 4566 + }, + { + "epoch": 1.386249810289877, + "grad_norm": 0.5607746243476868, + "learning_rate": 7.693125442948264e-05, + "loss": 1.5361, + "step": 4567 + }, + { + "epoch": 1.3865533464865685, + "grad_norm": 0.5287503004074097, + "learning_rate": 7.692619216361243e-05, + "loss": 1.4119, + "step": 4568 + }, + { + "epoch": 1.38685688268326, + "grad_norm": 0.7318875193595886, + "learning_rate": 7.692112989774223e-05, + "loss": 1.7289, + "step": 4569 + }, + { + "epoch": 1.3871604188799513, + "grad_norm": 1.9280884265899658, + "learning_rate": 7.691606763187202e-05, + "loss": 2.1525, + "step": 4570 + }, + { + "epoch": 1.387463955076643, + "grad_norm": 0.456725537776947, + "learning_rate": 7.691100536600183e-05, + "loss": 1.2377, + "step": 4571 + }, + { + "epoch": 1.3877674912733342, + "grad_norm": 0.541840672492981, + "learning_rate": 7.690594310013162e-05, + "loss": 1.3657, + "step": 4572 + }, + { + "epoch": 1.3880710274700259, + "grad_norm": 0.47133737802505493, + "learning_rate": 7.690088083426142e-05, + "loss": 1.8308, + "step": 4573 + }, + { + "epoch": 1.3883745636667173, + "grad_norm": 0.517042875289917, + "learning_rate": 7.689581856839123e-05, + "loss": 1.8143, + "step": 4574 + }, + { + "epoch": 1.3886780998634087, + "grad_norm": 0.6014066338539124, + "learning_rate": 7.689075630252102e-05, + "loss": 1.6925, + "step": 4575 + }, + { + "epoch": 1.3889816360601002, + "grad_norm": 0.4809291958808899, + "learning_rate": 7.688569403665082e-05, + "loss": 1.9902, + "step": 4576 + }, + { + "epoch": 1.3892851722567916, + "grad_norm": 0.6222968101501465, + "learning_rate": 7.688063177078061e-05, + "loss": 1.701, + "step": 4577 + }, + { + "epoch": 1.389588708453483, + "grad_norm": 0.4806516468524933, + "learning_rate": 7.68755695049104e-05, + "loss": 2.0006, + "step": 4578 + }, + { + "epoch": 1.3898922446501745, + "grad_norm": 0.6643034815788269, + "learning_rate": 7.68705072390402e-05, + "loss": 1.7205, + "step": 4579 + }, + { + "epoch": 1.3901957808468661, + "grad_norm": 0.6698145270347595, + "learning_rate": 7.686544497317e-05, + "loss": 1.7128, + "step": 4580 + }, + { + "epoch": 1.3904993170435573, + "grad_norm": 0.5213164687156677, + "learning_rate": 7.686038270729979e-05, + "loss": 1.5694, + "step": 4581 + }, + { + "epoch": 1.390802853240249, + "grad_norm": 0.5646512508392334, + "learning_rate": 7.685532044142958e-05, + "loss": 1.6217, + "step": 4582 + }, + { + "epoch": 1.3911063894369404, + "grad_norm": 0.6456514000892639, + "learning_rate": 7.685025817555939e-05, + "loss": 1.6194, + "step": 4583 + }, + { + "epoch": 1.3914099256336319, + "grad_norm": 0.6872454881668091, + "learning_rate": 7.684519590968919e-05, + "loss": 1.5893, + "step": 4584 + }, + { + "epoch": 1.3917134618303233, + "grad_norm": 0.36732757091522217, + "learning_rate": 7.684013364381898e-05, + "loss": 1.3996, + "step": 4585 + }, + { + "epoch": 1.3920169980270147, + "grad_norm": 0.4791436791419983, + "learning_rate": 7.683507137794878e-05, + "loss": 1.386, + "step": 4586 + }, + { + "epoch": 1.3923205342237062, + "grad_norm": 0.4628514051437378, + "learning_rate": 7.683000911207857e-05, + "loss": 1.483, + "step": 4587 + }, + { + "epoch": 1.3926240704203976, + "grad_norm": 0.5104525089263916, + "learning_rate": 7.682494684620837e-05, + "loss": 1.8155, + "step": 4588 + }, + { + "epoch": 1.392927606617089, + "grad_norm": 0.49548450112342834, + "learning_rate": 7.681988458033816e-05, + "loss": 1.6658, + "step": 4589 + }, + { + "epoch": 1.3932311428137805, + "grad_norm": 0.6759939193725586, + "learning_rate": 7.681482231446796e-05, + "loss": 1.798, + "step": 4590 + }, + { + "epoch": 1.3935346790104721, + "grad_norm": 0.5013139843940735, + "learning_rate": 7.680976004859775e-05, + "loss": 1.4767, + "step": 4591 + }, + { + "epoch": 1.3938382152071633, + "grad_norm": 0.5529112219810486, + "learning_rate": 7.680469778272755e-05, + "loss": 2.0458, + "step": 4592 + }, + { + "epoch": 1.394141751403855, + "grad_norm": 0.4460000991821289, + "learning_rate": 7.679963551685735e-05, + "loss": 1.4609, + "step": 4593 + }, + { + "epoch": 1.3944452876005464, + "grad_norm": 0.5933369994163513, + "learning_rate": 7.679457325098715e-05, + "loss": 1.4755, + "step": 4594 + }, + { + "epoch": 1.3947488237972379, + "grad_norm": 0.5485585927963257, + "learning_rate": 7.678951098511694e-05, + "loss": 1.5591, + "step": 4595 + }, + { + "epoch": 1.3950523599939293, + "grad_norm": 0.5463452339172363, + "learning_rate": 7.678444871924674e-05, + "loss": 1.4816, + "step": 4596 + }, + { + "epoch": 1.3953558961906207, + "grad_norm": 0.5605781078338623, + "learning_rate": 7.677938645337653e-05, + "loss": 1.4213, + "step": 4597 + }, + { + "epoch": 1.3956594323873122, + "grad_norm": 0.5562348961830139, + "learning_rate": 7.677432418750633e-05, + "loss": 1.6317, + "step": 4598 + }, + { + "epoch": 1.3959629685840036, + "grad_norm": 0.5632421970367432, + "learning_rate": 7.676926192163612e-05, + "loss": 1.6231, + "step": 4599 + }, + { + "epoch": 1.396266504780695, + "grad_norm": 0.5117442011833191, + "learning_rate": 7.676419965576592e-05, + "loss": 1.535, + "step": 4600 + }, + { + "epoch": 1.3965700409773865, + "grad_norm": 0.576318085193634, + "learning_rate": 7.675913738989571e-05, + "loss": 1.6773, + "step": 4601 + }, + { + "epoch": 1.3968735771740781, + "grad_norm": 0.5794771909713745, + "learning_rate": 7.675407512402552e-05, + "loss": 1.3376, + "step": 4602 + }, + { + "epoch": 1.3971771133707693, + "grad_norm": 0.519234299659729, + "learning_rate": 7.674901285815532e-05, + "loss": 1.4872, + "step": 4603 + }, + { + "epoch": 1.397480649567461, + "grad_norm": 0.48116299510002136, + "learning_rate": 7.674395059228511e-05, + "loss": 1.7884, + "step": 4604 + }, + { + "epoch": 1.3977841857641524, + "grad_norm": 0.4485664367675781, + "learning_rate": 7.67388883264149e-05, + "loss": 1.7424, + "step": 4605 + }, + { + "epoch": 1.3980877219608439, + "grad_norm": 0.5528675317764282, + "learning_rate": 7.67338260605447e-05, + "loss": 1.5698, + "step": 4606 + }, + { + "epoch": 1.3983912581575353, + "grad_norm": 0.6204590797424316, + "learning_rate": 7.67287637946745e-05, + "loss": 1.5903, + "step": 4607 + }, + { + "epoch": 1.3986947943542267, + "grad_norm": 0.5104626417160034, + "learning_rate": 7.672370152880429e-05, + "loss": 1.7593, + "step": 4608 + }, + { + "epoch": 1.3989983305509182, + "grad_norm": 0.5637308359146118, + "learning_rate": 7.671863926293408e-05, + "loss": 1.459, + "step": 4609 + }, + { + "epoch": 1.3993018667476096, + "grad_norm": 0.4793213903903961, + "learning_rate": 7.671357699706388e-05, + "loss": 1.5054, + "step": 4610 + }, + { + "epoch": 1.399605402944301, + "grad_norm": 0.5412987470626831, + "learning_rate": 7.670851473119369e-05, + "loss": 1.5059, + "step": 4611 + }, + { + "epoch": 1.3999089391409925, + "grad_norm": 0.5955209136009216, + "learning_rate": 7.670345246532348e-05, + "loss": 1.5, + "step": 4612 + }, + { + "epoch": 1.4002124753376841, + "grad_norm": 2.214019536972046, + "learning_rate": 7.669839019945328e-05, + "loss": 0.8095, + "step": 4613 + }, + { + "epoch": 1.4005160115343755, + "grad_norm": 0.5414422154426575, + "learning_rate": 7.669332793358307e-05, + "loss": 1.5825, + "step": 4614 + }, + { + "epoch": 1.400819547731067, + "grad_norm": 0.564838707447052, + "learning_rate": 7.668826566771288e-05, + "loss": 1.645, + "step": 4615 + }, + { + "epoch": 1.4011230839277584, + "grad_norm": 0.5650699734687805, + "learning_rate": 7.668320340184268e-05, + "loss": 1.3399, + "step": 4616 + }, + { + "epoch": 1.4014266201244499, + "grad_norm": 0.6401330232620239, + "learning_rate": 7.667814113597247e-05, + "loss": 1.9951, + "step": 4617 + }, + { + "epoch": 1.4017301563211413, + "grad_norm": 0.586685836315155, + "learning_rate": 7.667307887010226e-05, + "loss": 1.4413, + "step": 4618 + }, + { + "epoch": 1.4020336925178327, + "grad_norm": 0.551403284072876, + "learning_rate": 7.666801660423206e-05, + "loss": 1.693, + "step": 4619 + }, + { + "epoch": 1.4023372287145242, + "grad_norm": 0.4873546361923218, + "learning_rate": 7.666295433836185e-05, + "loss": 1.8064, + "step": 4620 + }, + { + "epoch": 1.4026407649112156, + "grad_norm": 0.6161354780197144, + "learning_rate": 7.665789207249165e-05, + "loss": 1.5373, + "step": 4621 + }, + { + "epoch": 1.4029443011079072, + "grad_norm": 0.556962788105011, + "learning_rate": 7.665282980662146e-05, + "loss": 1.5957, + "step": 4622 + }, + { + "epoch": 1.4032478373045985, + "grad_norm": 0.566043496131897, + "learning_rate": 7.664776754075125e-05, + "loss": 1.5312, + "step": 4623 + }, + { + "epoch": 1.4035513735012901, + "grad_norm": 0.5295618772506714, + "learning_rate": 7.664270527488105e-05, + "loss": 1.7293, + "step": 4624 + }, + { + "epoch": 1.4038549096979815, + "grad_norm": 0.47450530529022217, + "learning_rate": 7.663764300901084e-05, + "loss": 1.6276, + "step": 4625 + }, + { + "epoch": 1.404158445894673, + "grad_norm": 0.5055714249610901, + "learning_rate": 7.663258074314064e-05, + "loss": 1.8185, + "step": 4626 + }, + { + "epoch": 1.4044619820913644, + "grad_norm": 0.43849778175354004, + "learning_rate": 7.662751847727043e-05, + "loss": 1.2328, + "step": 4627 + }, + { + "epoch": 1.4047655182880558, + "grad_norm": 0.49399152398109436, + "learning_rate": 7.662245621140023e-05, + "loss": 1.4324, + "step": 4628 + }, + { + "epoch": 1.4050690544847473, + "grad_norm": 2.6655149459838867, + "learning_rate": 7.661739394553002e-05, + "loss": 1.0911, + "step": 4629 + }, + { + "epoch": 1.4053725906814387, + "grad_norm": 0.5486051440238953, + "learning_rate": 7.661233167965982e-05, + "loss": 1.6358, + "step": 4630 + }, + { + "epoch": 1.4056761268781301, + "grad_norm": 0.4878422021865845, + "learning_rate": 7.660726941378961e-05, + "loss": 1.3881, + "step": 4631 + }, + { + "epoch": 1.4059796630748216, + "grad_norm": 0.5612585544586182, + "learning_rate": 7.660220714791942e-05, + "loss": 2.0723, + "step": 4632 + }, + { + "epoch": 1.4062831992715132, + "grad_norm": 0.579932689666748, + "learning_rate": 7.659714488204921e-05, + "loss": 1.3426, + "step": 4633 + }, + { + "epoch": 1.4065867354682045, + "grad_norm": 0.4245831370353699, + "learning_rate": 7.659208261617901e-05, + "loss": 1.4925, + "step": 4634 + }, + { + "epoch": 1.406890271664896, + "grad_norm": 0.7544228434562683, + "learning_rate": 7.65870203503088e-05, + "loss": 1.5973, + "step": 4635 + }, + { + "epoch": 1.4071938078615875, + "grad_norm": 0.4973390996456146, + "learning_rate": 7.65819580844386e-05, + "loss": 1.6748, + "step": 4636 + }, + { + "epoch": 1.407497344058279, + "grad_norm": 0.5662533044815063, + "learning_rate": 7.657689581856839e-05, + "loss": 2.0457, + "step": 4637 + }, + { + "epoch": 1.4078008802549704, + "grad_norm": 0.5135043859481812, + "learning_rate": 7.657183355269819e-05, + "loss": 1.8762, + "step": 4638 + }, + { + "epoch": 1.4081044164516618, + "grad_norm": 0.5957035422325134, + "learning_rate": 7.656677128682798e-05, + "loss": 1.6636, + "step": 4639 + }, + { + "epoch": 1.4084079526483533, + "grad_norm": 0.5083897113800049, + "learning_rate": 7.656170902095778e-05, + "loss": 1.8611, + "step": 4640 + }, + { + "epoch": 1.4087114888450447, + "grad_norm": 0.5058372020721436, + "learning_rate": 7.655664675508759e-05, + "loss": 1.8053, + "step": 4641 + }, + { + "epoch": 1.4090150250417361, + "grad_norm": 0.5841237306594849, + "learning_rate": 7.655158448921738e-05, + "loss": 1.1644, + "step": 4642 + }, + { + "epoch": 1.4093185612384276, + "grad_norm": 0.5634949803352356, + "learning_rate": 7.654652222334718e-05, + "loss": 1.4692, + "step": 4643 + }, + { + "epoch": 1.4096220974351192, + "grad_norm": 0.9778512716293335, + "learning_rate": 7.654145995747697e-05, + "loss": 1.4193, + "step": 4644 + }, + { + "epoch": 1.4099256336318107, + "grad_norm": 0.5774378776550293, + "learning_rate": 7.653639769160677e-05, + "loss": 1.9765, + "step": 4645 + }, + { + "epoch": 1.410229169828502, + "grad_norm": 0.552975594997406, + "learning_rate": 7.653133542573656e-05, + "loss": 1.4232, + "step": 4646 + }, + { + "epoch": 1.4105327060251935, + "grad_norm": 0.4814278483390808, + "learning_rate": 7.652627315986635e-05, + "loss": 1.6995, + "step": 4647 + }, + { + "epoch": 1.410836242221885, + "grad_norm": 0.5448489189147949, + "learning_rate": 7.652121089399615e-05, + "loss": 1.6537, + "step": 4648 + }, + { + "epoch": 1.4111397784185764, + "grad_norm": 0.5354534387588501, + "learning_rate": 7.651614862812594e-05, + "loss": 1.5464, + "step": 4649 + }, + { + "epoch": 1.4114433146152678, + "grad_norm": 0.507394015789032, + "learning_rate": 7.651108636225575e-05, + "loss": 1.955, + "step": 4650 + }, + { + "epoch": 1.4117468508119593, + "grad_norm": 0.49544399976730347, + "learning_rate": 7.650602409638555e-05, + "loss": 1.8493, + "step": 4651 + }, + { + "epoch": 1.4120503870086507, + "grad_norm": 0.5341572165489197, + "learning_rate": 7.650096183051534e-05, + "loss": 1.8316, + "step": 4652 + }, + { + "epoch": 1.4123539232053424, + "grad_norm": 0.5258657932281494, + "learning_rate": 7.649589956464514e-05, + "loss": 1.8353, + "step": 4653 + }, + { + "epoch": 1.4126574594020336, + "grad_norm": 0.4699995517730713, + "learning_rate": 7.649083729877493e-05, + "loss": 1.6508, + "step": 4654 + }, + { + "epoch": 1.4129609955987252, + "grad_norm": 0.5846216082572937, + "learning_rate": 7.648577503290473e-05, + "loss": 1.3771, + "step": 4655 + }, + { + "epoch": 1.4132645317954167, + "grad_norm": 0.5015813112258911, + "learning_rate": 7.648071276703452e-05, + "loss": 1.8213, + "step": 4656 + }, + { + "epoch": 1.413568067992108, + "grad_norm": 0.5818765759468079, + "learning_rate": 7.647565050116432e-05, + "loss": 1.5861, + "step": 4657 + }, + { + "epoch": 1.4138716041887995, + "grad_norm": 0.5784850716590881, + "learning_rate": 7.647058823529411e-05, + "loss": 1.0852, + "step": 4658 + }, + { + "epoch": 1.414175140385491, + "grad_norm": 0.5787924528121948, + "learning_rate": 7.64655259694239e-05, + "loss": 1.8358, + "step": 4659 + }, + { + "epoch": 1.4144786765821824, + "grad_norm": 0.5465903282165527, + "learning_rate": 7.646046370355371e-05, + "loss": 1.7018, + "step": 4660 + }, + { + "epoch": 1.4147822127788738, + "grad_norm": 0.5020646452903748, + "learning_rate": 7.645540143768352e-05, + "loss": 1.8004, + "step": 4661 + }, + { + "epoch": 1.4150857489755653, + "grad_norm": 0.5645866990089417, + "learning_rate": 7.645033917181332e-05, + "loss": 1.255, + "step": 4662 + }, + { + "epoch": 1.4153892851722567, + "grad_norm": 0.4355092942714691, + "learning_rate": 7.644527690594311e-05, + "loss": 1.1957, + "step": 4663 + }, + { + "epoch": 1.4156928213689484, + "grad_norm": 0.6315711140632629, + "learning_rate": 7.644021464007291e-05, + "loss": 1.2085, + "step": 4664 + }, + { + "epoch": 1.4159963575656396, + "grad_norm": 0.47501039505004883, + "learning_rate": 7.64351523742027e-05, + "loss": 2.0933, + "step": 4665 + }, + { + "epoch": 1.4162998937623312, + "grad_norm": 0.45434871315956116, + "learning_rate": 7.64300901083325e-05, + "loss": 1.2583, + "step": 4666 + }, + { + "epoch": 1.4166034299590227, + "grad_norm": 0.6555678844451904, + "learning_rate": 7.642502784246229e-05, + "loss": 1.5034, + "step": 4667 + }, + { + "epoch": 1.416906966155714, + "grad_norm": 0.5302774310112, + "learning_rate": 7.641996557659209e-05, + "loss": 2.0505, + "step": 4668 + }, + { + "epoch": 1.4172105023524055, + "grad_norm": 0.4992988109588623, + "learning_rate": 7.641490331072188e-05, + "loss": 1.492, + "step": 4669 + }, + { + "epoch": 1.417514038549097, + "grad_norm": 0.5464704036712646, + "learning_rate": 7.640984104485168e-05, + "loss": 1.8207, + "step": 4670 + }, + { + "epoch": 1.4178175747457884, + "grad_norm": 0.60384601354599, + "learning_rate": 7.640477877898148e-05, + "loss": 1.6035, + "step": 4671 + }, + { + "epoch": 1.4181211109424798, + "grad_norm": 0.5650917887687683, + "learning_rate": 7.639971651311128e-05, + "loss": 1.6211, + "step": 4672 + }, + { + "epoch": 1.4184246471391713, + "grad_norm": 0.6552462577819824, + "learning_rate": 7.639465424724107e-05, + "loss": 1.5755, + "step": 4673 + }, + { + "epoch": 1.4187281833358627, + "grad_norm": 0.5759985446929932, + "learning_rate": 7.638959198137087e-05, + "loss": 1.4893, + "step": 4674 + }, + { + "epoch": 1.4190317195325544, + "grad_norm": 0.5508556365966797, + "learning_rate": 7.638452971550066e-05, + "loss": 1.6477, + "step": 4675 + }, + { + "epoch": 1.4193352557292458, + "grad_norm": 0.48576992750167847, + "learning_rate": 7.637946744963046e-05, + "loss": 1.8028, + "step": 4676 + }, + { + "epoch": 1.4196387919259372, + "grad_norm": 1.5687899589538574, + "learning_rate": 7.637440518376025e-05, + "loss": 1.4341, + "step": 4677 + }, + { + "epoch": 1.4199423281226287, + "grad_norm": 0.4605062007904053, + "learning_rate": 7.636934291789005e-05, + "loss": 1.6236, + "step": 4678 + }, + { + "epoch": 1.42024586431932, + "grad_norm": 0.5490358471870422, + "learning_rate": 7.636428065201984e-05, + "loss": 1.4707, + "step": 4679 + }, + { + "epoch": 1.4205494005160115, + "grad_norm": 0.5529788732528687, + "learning_rate": 7.635921838614965e-05, + "loss": 1.5166, + "step": 4680 + }, + { + "epoch": 1.420852936712703, + "grad_norm": 0.5166842341423035, + "learning_rate": 7.635415612027945e-05, + "loss": 1.7152, + "step": 4681 + }, + { + "epoch": 1.4211564729093944, + "grad_norm": 0.5218782424926758, + "learning_rate": 7.634909385440924e-05, + "loss": 1.4112, + "step": 4682 + }, + { + "epoch": 1.4214600091060858, + "grad_norm": 0.5360171794891357, + "learning_rate": 7.634403158853904e-05, + "loss": 1.9449, + "step": 4683 + }, + { + "epoch": 1.4217635453027775, + "grad_norm": 0.45122429728507996, + "learning_rate": 7.633896932266883e-05, + "loss": 1.5348, + "step": 4684 + }, + { + "epoch": 1.4220670814994687, + "grad_norm": 0.541969895362854, + "learning_rate": 7.633390705679862e-05, + "loss": 1.549, + "step": 4685 + }, + { + "epoch": 1.4223706176961604, + "grad_norm": 0.45322978496551514, + "learning_rate": 7.632884479092842e-05, + "loss": 1.6838, + "step": 4686 + }, + { + "epoch": 1.4226741538928518, + "grad_norm": 0.5295718908309937, + "learning_rate": 7.632378252505821e-05, + "loss": 1.539, + "step": 4687 + }, + { + "epoch": 1.4229776900895432, + "grad_norm": 0.49040156602859497, + "learning_rate": 7.631872025918801e-05, + "loss": 1.8898, + "step": 4688 + }, + { + "epoch": 1.4232812262862347, + "grad_norm": 0.5323116183280945, + "learning_rate": 7.631365799331782e-05, + "loss": 1.818, + "step": 4689 + }, + { + "epoch": 1.423584762482926, + "grad_norm": 0.5131807327270508, + "learning_rate": 7.630859572744761e-05, + "loss": 1.4028, + "step": 4690 + }, + { + "epoch": 1.4238882986796175, + "grad_norm": 0.5579325556755066, + "learning_rate": 7.630353346157741e-05, + "loss": 1.4887, + "step": 4691 + }, + { + "epoch": 1.424191834876309, + "grad_norm": 0.5273114442825317, + "learning_rate": 7.62984711957072e-05, + "loss": 1.9394, + "step": 4692 + }, + { + "epoch": 1.4244953710730004, + "grad_norm": 0.5281380414962769, + "learning_rate": 7.6293408929837e-05, + "loss": 1.359, + "step": 4693 + }, + { + "epoch": 1.4247989072696918, + "grad_norm": 0.5331006050109863, + "learning_rate": 7.628834666396679e-05, + "loss": 1.6283, + "step": 4694 + }, + { + "epoch": 1.4251024434663835, + "grad_norm": 0.6110636591911316, + "learning_rate": 7.628328439809659e-05, + "loss": 1.4206, + "step": 4695 + }, + { + "epoch": 1.4254059796630747, + "grad_norm": 0.5301719903945923, + "learning_rate": 7.627822213222638e-05, + "loss": 1.9281, + "step": 4696 + }, + { + "epoch": 1.4257095158597664, + "grad_norm": 0.5527634620666504, + "learning_rate": 7.627315986635618e-05, + "loss": 1.5625, + "step": 4697 + }, + { + "epoch": 1.4260130520564578, + "grad_norm": 0.5319656729698181, + "learning_rate": 7.626809760048597e-05, + "loss": 1.67, + "step": 4698 + }, + { + "epoch": 1.4263165882531492, + "grad_norm": 0.5614206790924072, + "learning_rate": 7.626303533461578e-05, + "loss": 1.7777, + "step": 4699 + }, + { + "epoch": 1.4266201244498407, + "grad_norm": 0.5032866597175598, + "learning_rate": 7.625797306874557e-05, + "loss": 1.9566, + "step": 4700 + }, + { + "epoch": 1.426923660646532, + "grad_norm": 0.5314932465553284, + "learning_rate": 7.625291080287537e-05, + "loss": 1.6655, + "step": 4701 + }, + { + "epoch": 1.4272271968432235, + "grad_norm": 0.47711166739463806, + "learning_rate": 7.624784853700516e-05, + "loss": 1.6008, + "step": 4702 + }, + { + "epoch": 1.427530733039915, + "grad_norm": 0.5699756741523743, + "learning_rate": 7.624278627113496e-05, + "loss": 1.5925, + "step": 4703 + }, + { + "epoch": 1.4278342692366064, + "grad_norm": 0.5623230338096619, + "learning_rate": 7.623772400526477e-05, + "loss": 1.7858, + "step": 4704 + }, + { + "epoch": 1.4281378054332978, + "grad_norm": 0.5966967344284058, + "learning_rate": 7.623266173939456e-05, + "loss": 1.0612, + "step": 4705 + }, + { + "epoch": 1.4284413416299895, + "grad_norm": 0.492683082818985, + "learning_rate": 7.622759947352436e-05, + "loss": 1.895, + "step": 4706 + }, + { + "epoch": 1.428744877826681, + "grad_norm": 0.5114320516586304, + "learning_rate": 7.622253720765415e-05, + "loss": 1.9176, + "step": 4707 + }, + { + "epoch": 1.4290484140233723, + "grad_norm": 0.47429636120796204, + "learning_rate": 7.621747494178395e-05, + "loss": 2.1511, + "step": 4708 + }, + { + "epoch": 1.4293519502200638, + "grad_norm": 0.5218935012817383, + "learning_rate": 7.621241267591374e-05, + "loss": 1.5853, + "step": 4709 + }, + { + "epoch": 1.4296554864167552, + "grad_norm": 0.5426785349845886, + "learning_rate": 7.620735041004355e-05, + "loss": 1.8157, + "step": 4710 + }, + { + "epoch": 1.4299590226134467, + "grad_norm": 0.5341148972511292, + "learning_rate": 7.620228814417334e-05, + "loss": 1.697, + "step": 4711 + }, + { + "epoch": 1.430262558810138, + "grad_norm": 0.7054175734519958, + "learning_rate": 7.619722587830314e-05, + "loss": 1.5556, + "step": 4712 + }, + { + "epoch": 1.4305660950068295, + "grad_norm": 0.903923749923706, + "learning_rate": 7.619216361243293e-05, + "loss": 1.6436, + "step": 4713 + }, + { + "epoch": 1.430869631203521, + "grad_norm": 0.48794859647750854, + "learning_rate": 7.618710134656273e-05, + "loss": 1.809, + "step": 4714 + }, + { + "epoch": 1.4311731674002126, + "grad_norm": 0.47096773982048035, + "learning_rate": 7.618203908069252e-05, + "loss": 1.6649, + "step": 4715 + }, + { + "epoch": 1.4314767035969038, + "grad_norm": 0.6093799471855164, + "learning_rate": 7.617697681482232e-05, + "loss": 1.8031, + "step": 4716 + }, + { + "epoch": 1.4317802397935955, + "grad_norm": 0.5231300592422485, + "learning_rate": 7.617191454895211e-05, + "loss": 1.7465, + "step": 4717 + }, + { + "epoch": 1.432083775990287, + "grad_norm": 0.545767605304718, + "learning_rate": 7.616685228308191e-05, + "loss": 0.9108, + "step": 4718 + }, + { + "epoch": 1.4323873121869783, + "grad_norm": 0.4710002839565277, + "learning_rate": 7.616179001721172e-05, + "loss": 1.8432, + "step": 4719 + }, + { + "epoch": 1.4326908483836698, + "grad_norm": 0.6048409938812256, + "learning_rate": 7.615672775134151e-05, + "loss": 1.8028, + "step": 4720 + }, + { + "epoch": 1.4329943845803612, + "grad_norm": 0.4102180600166321, + "learning_rate": 7.61516654854713e-05, + "loss": 1.3209, + "step": 4721 + }, + { + "epoch": 1.4332979207770526, + "grad_norm": 0.5769391655921936, + "learning_rate": 7.61466032196011e-05, + "loss": 1.4543, + "step": 4722 + }, + { + "epoch": 1.433601456973744, + "grad_norm": 0.5256406664848328, + "learning_rate": 7.61415409537309e-05, + "loss": 1.572, + "step": 4723 + }, + { + "epoch": 1.4339049931704355, + "grad_norm": 0.5865902900695801, + "learning_rate": 7.613647868786069e-05, + "loss": 1.3323, + "step": 4724 + }, + { + "epoch": 1.434208529367127, + "grad_norm": 0.5765762329101562, + "learning_rate": 7.613141642199048e-05, + "loss": 1.8572, + "step": 4725 + }, + { + "epoch": 1.4345120655638186, + "grad_norm": 0.6536111235618591, + "learning_rate": 7.612635415612028e-05, + "loss": 1.1194, + "step": 4726 + }, + { + "epoch": 1.4348156017605098, + "grad_norm": 0.559202253818512, + "learning_rate": 7.612129189025007e-05, + "loss": 1.9195, + "step": 4727 + }, + { + "epoch": 1.4351191379572015, + "grad_norm": 0.5304152369499207, + "learning_rate": 7.611622962437988e-05, + "loss": 1.4013, + "step": 4728 + }, + { + "epoch": 1.435422674153893, + "grad_norm": 0.5185491442680359, + "learning_rate": 7.611116735850968e-05, + "loss": 1.7297, + "step": 4729 + }, + { + "epoch": 1.4357262103505843, + "grad_norm": 0.5597679018974304, + "learning_rate": 7.610610509263947e-05, + "loss": 1.4894, + "step": 4730 + }, + { + "epoch": 1.4360297465472758, + "grad_norm": 0.47206321358680725, + "learning_rate": 7.610104282676927e-05, + "loss": 2.1203, + "step": 4731 + }, + { + "epoch": 1.4363332827439672, + "grad_norm": 0.536067545413971, + "learning_rate": 7.609598056089906e-05, + "loss": 1.4863, + "step": 4732 + }, + { + "epoch": 1.4366368189406586, + "grad_norm": 0.510307788848877, + "learning_rate": 7.609091829502886e-05, + "loss": 1.6554, + "step": 4733 + }, + { + "epoch": 1.43694035513735, + "grad_norm": 0.4985925853252411, + "learning_rate": 7.608585602915865e-05, + "loss": 1.5833, + "step": 4734 + }, + { + "epoch": 1.4372438913340415, + "grad_norm": 0.4628017544746399, + "learning_rate": 7.608079376328845e-05, + "loss": 1.7996, + "step": 4735 + }, + { + "epoch": 1.437547427530733, + "grad_norm": 0.5663459300994873, + "learning_rate": 7.607573149741824e-05, + "loss": 1.5257, + "step": 4736 + }, + { + "epoch": 1.4378509637274246, + "grad_norm": 0.4896049201488495, + "learning_rate": 7.607066923154804e-05, + "loss": 1.8763, + "step": 4737 + }, + { + "epoch": 1.4381544999241158, + "grad_norm": 0.4533321261405945, + "learning_rate": 7.606560696567784e-05, + "loss": 0.8658, + "step": 4738 + }, + { + "epoch": 1.4384580361208075, + "grad_norm": 0.5471289157867432, + "learning_rate": 7.606054469980764e-05, + "loss": 2.0191, + "step": 4739 + }, + { + "epoch": 1.438761572317499, + "grad_norm": 0.45516133308410645, + "learning_rate": 7.605548243393743e-05, + "loss": 1.3933, + "step": 4740 + }, + { + "epoch": 1.4390651085141903, + "grad_norm": 0.7363737225532532, + "learning_rate": 7.605042016806723e-05, + "loss": 1.7563, + "step": 4741 + }, + { + "epoch": 1.4393686447108818, + "grad_norm": 0.403385192155838, + "learning_rate": 7.604535790219702e-05, + "loss": 1.9145, + "step": 4742 + }, + { + "epoch": 1.4396721809075732, + "grad_norm": 0.502412736415863, + "learning_rate": 7.604029563632682e-05, + "loss": 1.6981, + "step": 4743 + }, + { + "epoch": 1.4399757171042646, + "grad_norm": 0.5202810168266296, + "learning_rate": 7.603523337045661e-05, + "loss": 1.7792, + "step": 4744 + }, + { + "epoch": 1.440279253300956, + "grad_norm": 0.5282444357872009, + "learning_rate": 7.603017110458641e-05, + "loss": 1.7323, + "step": 4745 + }, + { + "epoch": 1.4405827894976477, + "grad_norm": 0.4394991099834442, + "learning_rate": 7.60251088387162e-05, + "loss": 1.7267, + "step": 4746 + }, + { + "epoch": 1.440886325694339, + "grad_norm": 0.5699142217636108, + "learning_rate": 7.602004657284601e-05, + "loss": 1.7191, + "step": 4747 + }, + { + "epoch": 1.4411898618910306, + "grad_norm": 0.5591686964035034, + "learning_rate": 7.60149843069758e-05, + "loss": 1.6067, + "step": 4748 + }, + { + "epoch": 1.441493398087722, + "grad_norm": 0.4762207269668579, + "learning_rate": 7.600992204110561e-05, + "loss": 1.7094, + "step": 4749 + }, + { + "epoch": 1.4417969342844135, + "grad_norm": 0.5511293411254883, + "learning_rate": 7.600485977523541e-05, + "loss": 1.4719, + "step": 4750 + }, + { + "epoch": 1.442100470481105, + "grad_norm": 0.5466549396514893, + "learning_rate": 7.59997975093652e-05, + "loss": 1.8903, + "step": 4751 + }, + { + "epoch": 1.4424040066777963, + "grad_norm": 0.7153205871582031, + "learning_rate": 7.5994735243495e-05, + "loss": 1.6593, + "step": 4752 + }, + { + "epoch": 1.4427075428744878, + "grad_norm": 0.49872803688049316, + "learning_rate": 7.598967297762479e-05, + "loss": 1.3233, + "step": 4753 + }, + { + "epoch": 1.4430110790711792, + "grad_norm": 0.558476984500885, + "learning_rate": 7.598461071175459e-05, + "loss": 1.5951, + "step": 4754 + }, + { + "epoch": 1.4433146152678706, + "grad_norm": 0.4782353639602661, + "learning_rate": 7.597954844588438e-05, + "loss": 1.5372, + "step": 4755 + }, + { + "epoch": 1.443618151464562, + "grad_norm": 0.5572071075439453, + "learning_rate": 7.597448618001418e-05, + "loss": 1.4866, + "step": 4756 + }, + { + "epoch": 1.4439216876612537, + "grad_norm": 0.61551433801651, + "learning_rate": 7.596942391414397e-05, + "loss": 1.6909, + "step": 4757 + }, + { + "epoch": 1.444225223857945, + "grad_norm": 0.5780778527259827, + "learning_rate": 7.596436164827378e-05, + "loss": 1.4837, + "step": 4758 + }, + { + "epoch": 1.4445287600546366, + "grad_norm": 0.6437112092971802, + "learning_rate": 7.595929938240357e-05, + "loss": 2.1734, + "step": 4759 + }, + { + "epoch": 1.444832296251328, + "grad_norm": 0.5654916167259216, + "learning_rate": 7.595423711653337e-05, + "loss": 1.407, + "step": 4760 + }, + { + "epoch": 1.4451358324480195, + "grad_norm": 0.6159085631370544, + "learning_rate": 7.594917485066316e-05, + "loss": 1.6069, + "step": 4761 + }, + { + "epoch": 1.445439368644711, + "grad_norm": 0.5013946294784546, + "learning_rate": 7.594411258479296e-05, + "loss": 1.8218, + "step": 4762 + }, + { + "epoch": 1.4457429048414023, + "grad_norm": 0.49084192514419556, + "learning_rate": 7.593905031892275e-05, + "loss": 1.8749, + "step": 4763 + }, + { + "epoch": 1.4460464410380938, + "grad_norm": 0.4393550455570221, + "learning_rate": 7.593398805305255e-05, + "loss": 1.4319, + "step": 4764 + }, + { + "epoch": 1.4463499772347852, + "grad_norm": 0.5603815317153931, + "learning_rate": 7.592892578718234e-05, + "loss": 1.6359, + "step": 4765 + }, + { + "epoch": 1.4466535134314766, + "grad_norm": 0.5094273686408997, + "learning_rate": 7.592386352131214e-05, + "loss": 1.9181, + "step": 4766 + }, + { + "epoch": 1.446957049628168, + "grad_norm": 0.5389642119407654, + "learning_rate": 7.591880125544195e-05, + "loss": 1.888, + "step": 4767 + }, + { + "epoch": 1.4472605858248597, + "grad_norm": 0.5263598561286926, + "learning_rate": 7.591373898957174e-05, + "loss": 1.7503, + "step": 4768 + }, + { + "epoch": 1.447564122021551, + "grad_norm": 0.4412989318370819, + "learning_rate": 7.590867672370154e-05, + "loss": 1.2045, + "step": 4769 + }, + { + "epoch": 1.4478676582182426, + "grad_norm": 0.5222303867340088, + "learning_rate": 7.590361445783133e-05, + "loss": 1.8976, + "step": 4770 + }, + { + "epoch": 1.448171194414934, + "grad_norm": 0.49793916940689087, + "learning_rate": 7.589855219196113e-05, + "loss": 1.3931, + "step": 4771 + }, + { + "epoch": 1.4484747306116255, + "grad_norm": 0.5002750754356384, + "learning_rate": 7.589348992609092e-05, + "loss": 1.6249, + "step": 4772 + }, + { + "epoch": 1.448778266808317, + "grad_norm": 0.4378749132156372, + "learning_rate": 7.588842766022072e-05, + "loss": 1.6208, + "step": 4773 + }, + { + "epoch": 1.4490818030050083, + "grad_norm": 0.5322429537773132, + "learning_rate": 7.588336539435051e-05, + "loss": 1.6444, + "step": 4774 + }, + { + "epoch": 1.4493853392016998, + "grad_norm": 0.5761924982070923, + "learning_rate": 7.58783031284803e-05, + "loss": 1.6111, + "step": 4775 + }, + { + "epoch": 1.4496888753983912, + "grad_norm": 0.5562173128128052, + "learning_rate": 7.58732408626101e-05, + "loss": 1.9395, + "step": 4776 + }, + { + "epoch": 1.4499924115950826, + "grad_norm": 0.5333907008171082, + "learning_rate": 7.586817859673991e-05, + "loss": 1.8048, + "step": 4777 + }, + { + "epoch": 1.450295947791774, + "grad_norm": 0.5698195099830627, + "learning_rate": 7.58631163308697e-05, + "loss": 1.4219, + "step": 4778 + }, + { + "epoch": 1.4505994839884657, + "grad_norm": 0.4817031919956207, + "learning_rate": 7.58580540649995e-05, + "loss": 0.9775, + "step": 4779 + }, + { + "epoch": 1.4509030201851572, + "grad_norm": 0.527871310710907, + "learning_rate": 7.585299179912929e-05, + "loss": 1.8112, + "step": 4780 + }, + { + "epoch": 1.4512065563818486, + "grad_norm": 0.5405674576759338, + "learning_rate": 7.584792953325909e-05, + "loss": 1.8271, + "step": 4781 + }, + { + "epoch": 1.45151009257854, + "grad_norm": 0.5090677738189697, + "learning_rate": 7.584286726738888e-05, + "loss": 1.851, + "step": 4782 + }, + { + "epoch": 1.4518136287752315, + "grad_norm": 0.5070275068283081, + "learning_rate": 7.583780500151868e-05, + "loss": 1.4614, + "step": 4783 + }, + { + "epoch": 1.452117164971923, + "grad_norm": 0.5664345622062683, + "learning_rate": 7.583274273564847e-05, + "loss": 1.4379, + "step": 4784 + }, + { + "epoch": 1.4524207011686143, + "grad_norm": 0.7471550107002258, + "learning_rate": 7.582768046977827e-05, + "loss": 1.3397, + "step": 4785 + }, + { + "epoch": 1.4527242373653058, + "grad_norm": 0.4601489007472992, + "learning_rate": 7.582261820390808e-05, + "loss": 1.8286, + "step": 4786 + }, + { + "epoch": 1.4530277735619972, + "grad_norm": 0.5429388880729675, + "learning_rate": 7.581755593803787e-05, + "loss": 1.5062, + "step": 4787 + }, + { + "epoch": 1.4533313097586888, + "grad_norm": 0.5463858842849731, + "learning_rate": 7.581249367216766e-05, + "loss": 1.8726, + "step": 4788 + }, + { + "epoch": 1.45363484595538, + "grad_norm": 0.4973449110984802, + "learning_rate": 7.580743140629746e-05, + "loss": 1.7763, + "step": 4789 + }, + { + "epoch": 1.4539383821520717, + "grad_norm": 0.7041242122650146, + "learning_rate": 7.580236914042725e-05, + "loss": 1.91, + "step": 4790 + }, + { + "epoch": 1.4542419183487632, + "grad_norm": 0.7030282020568848, + "learning_rate": 7.579730687455705e-05, + "loss": 1.5485, + "step": 4791 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 0.52986079454422, + "learning_rate": 7.579224460868684e-05, + "loss": 1.9781, + "step": 4792 + }, + { + "epoch": 1.454848990742146, + "grad_norm": 0.4283183515071869, + "learning_rate": 7.578718234281665e-05, + "loss": 1.8996, + "step": 4793 + }, + { + "epoch": 1.4551525269388375, + "grad_norm": 0.5609734654426575, + "learning_rate": 7.578212007694645e-05, + "loss": 1.5946, + "step": 4794 + }, + { + "epoch": 1.4554560631355289, + "grad_norm": 0.46615251898765564, + "learning_rate": 7.577705781107624e-05, + "loss": 1.7604, + "step": 4795 + }, + { + "epoch": 1.4557595993322203, + "grad_norm": 0.5443726778030396, + "learning_rate": 7.577199554520604e-05, + "loss": 1.5563, + "step": 4796 + }, + { + "epoch": 1.4560631355289118, + "grad_norm": 0.5708737969398499, + "learning_rate": 7.576693327933584e-05, + "loss": 1.4693, + "step": 4797 + }, + { + "epoch": 1.4563666717256032, + "grad_norm": 0.5923335552215576, + "learning_rate": 7.576187101346564e-05, + "loss": 1.915, + "step": 4798 + }, + { + "epoch": 1.4566702079222948, + "grad_norm": 0.5595468282699585, + "learning_rate": 7.575680874759543e-05, + "loss": 1.4823, + "step": 4799 + }, + { + "epoch": 1.456973744118986, + "grad_norm": 0.583549439907074, + "learning_rate": 7.575174648172523e-05, + "loss": 1.9422, + "step": 4800 + }, + { + "epoch": 1.4572772803156777, + "grad_norm": 0.46514013409614563, + "learning_rate": 7.574668421585502e-05, + "loss": 2.0324, + "step": 4801 + }, + { + "epoch": 1.4575808165123691, + "grad_norm": 0.46036651730537415, + "learning_rate": 7.574162194998482e-05, + "loss": 1.0407, + "step": 4802 + }, + { + "epoch": 1.4578843527090606, + "grad_norm": 0.5273803472518921, + "learning_rate": 7.573655968411461e-05, + "loss": 1.4721, + "step": 4803 + }, + { + "epoch": 1.458187888905752, + "grad_norm": 0.39598920941352844, + "learning_rate": 7.573149741824441e-05, + "loss": 1.2043, + "step": 4804 + }, + { + "epoch": 1.4584914251024435, + "grad_norm": 0.5395318865776062, + "learning_rate": 7.57264351523742e-05, + "loss": 1.5629, + "step": 4805 + }, + { + "epoch": 1.4587949612991349, + "grad_norm": 0.42741745710372925, + "learning_rate": 7.572137288650401e-05, + "loss": 1.7217, + "step": 4806 + }, + { + "epoch": 1.4590984974958263, + "grad_norm": 0.5320115685462952, + "learning_rate": 7.57163106206338e-05, + "loss": 1.7133, + "step": 4807 + }, + { + "epoch": 1.4594020336925178, + "grad_norm": 0.5228841304779053, + "learning_rate": 7.57112483547636e-05, + "loss": 1.3938, + "step": 4808 + }, + { + "epoch": 1.4597055698892092, + "grad_norm": 0.4937410354614258, + "learning_rate": 7.57061860888934e-05, + "loss": 1.4786, + "step": 4809 + }, + { + "epoch": 1.4600091060859008, + "grad_norm": 0.46726706624031067, + "learning_rate": 7.570112382302319e-05, + "loss": 1.8383, + "step": 4810 + }, + { + "epoch": 1.4603126422825923, + "grad_norm": 0.5124707221984863, + "learning_rate": 7.569606155715299e-05, + "loss": 1.5768, + "step": 4811 + }, + { + "epoch": 1.4606161784792837, + "grad_norm": 0.5231637954711914, + "learning_rate": 7.569099929128278e-05, + "loss": 1.8708, + "step": 4812 + }, + { + "epoch": 1.4609197146759751, + "grad_norm": 0.5755016803741455, + "learning_rate": 7.568593702541258e-05, + "loss": 1.2787, + "step": 4813 + }, + { + "epoch": 1.4612232508726666, + "grad_norm": 0.550441563129425, + "learning_rate": 7.568087475954237e-05, + "loss": 1.6205, + "step": 4814 + }, + { + "epoch": 1.461526787069358, + "grad_norm": 0.5654580593109131, + "learning_rate": 7.567581249367216e-05, + "loss": 1.4837, + "step": 4815 + }, + { + "epoch": 1.4618303232660494, + "grad_norm": 0.4842512905597687, + "learning_rate": 7.567075022780197e-05, + "loss": 1.9908, + "step": 4816 + }, + { + "epoch": 1.4621338594627409, + "grad_norm": 0.5777815580368042, + "learning_rate": 7.566568796193177e-05, + "loss": 1.7202, + "step": 4817 + }, + { + "epoch": 1.4624373956594323, + "grad_norm": 0.5063660144805908, + "learning_rate": 7.566062569606156e-05, + "loss": 1.5393, + "step": 4818 + }, + { + "epoch": 1.462740931856124, + "grad_norm": 0.4989129602909088, + "learning_rate": 7.565556343019136e-05, + "loss": 1.6849, + "step": 4819 + }, + { + "epoch": 1.4630444680528152, + "grad_norm": 0.42997097969055176, + "learning_rate": 7.565050116432115e-05, + "loss": 1.6785, + "step": 4820 + }, + { + "epoch": 1.4633480042495068, + "grad_norm": 0.4580230414867401, + "learning_rate": 7.564543889845095e-05, + "loss": 1.9124, + "step": 4821 + }, + { + "epoch": 1.4636515404461983, + "grad_norm": 0.44534167647361755, + "learning_rate": 7.564037663258074e-05, + "loss": 1.6729, + "step": 4822 + }, + { + "epoch": 1.4639550766428897, + "grad_norm": 0.5121662616729736, + "learning_rate": 7.563531436671054e-05, + "loss": 1.8268, + "step": 4823 + }, + { + "epoch": 1.4642586128395811, + "grad_norm": 0.5929915904998779, + "learning_rate": 7.563025210084033e-05, + "loss": 1.4041, + "step": 4824 + }, + { + "epoch": 1.4645621490362726, + "grad_norm": 0.6041303277015686, + "learning_rate": 7.562518983497014e-05, + "loss": 1.5088, + "step": 4825 + }, + { + "epoch": 1.464865685232964, + "grad_norm": 0.6024086475372314, + "learning_rate": 7.562012756909993e-05, + "loss": 1.7524, + "step": 4826 + }, + { + "epoch": 1.4651692214296554, + "grad_norm": 0.5736476182937622, + "learning_rate": 7.561506530322973e-05, + "loss": 1.6131, + "step": 4827 + }, + { + "epoch": 1.4654727576263469, + "grad_norm": 0.5395902991294861, + "learning_rate": 7.561000303735952e-05, + "loss": 1.4278, + "step": 4828 + }, + { + "epoch": 1.4657762938230383, + "grad_norm": 0.7780386209487915, + "learning_rate": 7.560494077148932e-05, + "loss": 1.3143, + "step": 4829 + }, + { + "epoch": 1.46607983001973, + "grad_norm": 0.46117058396339417, + "learning_rate": 7.559987850561911e-05, + "loss": 1.6481, + "step": 4830 + }, + { + "epoch": 1.4663833662164212, + "grad_norm": 0.7055519223213196, + "learning_rate": 7.559481623974891e-05, + "loss": 1.7452, + "step": 4831 + }, + { + "epoch": 1.4666869024131128, + "grad_norm": 0.5338720679283142, + "learning_rate": 7.55897539738787e-05, + "loss": 1.7455, + "step": 4832 + }, + { + "epoch": 1.4669904386098043, + "grad_norm": 0.6645461916923523, + "learning_rate": 7.55846917080085e-05, + "loss": 1.6314, + "step": 4833 + }, + { + "epoch": 1.4672939748064957, + "grad_norm": 0.561160683631897, + "learning_rate": 7.55796294421383e-05, + "loss": 1.3721, + "step": 4834 + }, + { + "epoch": 1.4675975110031871, + "grad_norm": 0.4907180964946747, + "learning_rate": 7.55745671762681e-05, + "loss": 1.6024, + "step": 4835 + }, + { + "epoch": 1.4679010471998786, + "grad_norm": 0.6963015794754028, + "learning_rate": 7.55695049103979e-05, + "loss": 1.8332, + "step": 4836 + }, + { + "epoch": 1.46820458339657, + "grad_norm": 0.5751698017120361, + "learning_rate": 7.556444264452769e-05, + "loss": 1.7317, + "step": 4837 + }, + { + "epoch": 1.4685081195932614, + "grad_norm": 0.5866538882255554, + "learning_rate": 7.55593803786575e-05, + "loss": 1.6873, + "step": 4838 + }, + { + "epoch": 1.4688116557899529, + "grad_norm": 0.5480346083641052, + "learning_rate": 7.55543181127873e-05, + "loss": 1.8819, + "step": 4839 + }, + { + "epoch": 1.4691151919866443, + "grad_norm": 0.5058147311210632, + "learning_rate": 7.554925584691709e-05, + "loss": 1.4353, + "step": 4840 + }, + { + "epoch": 1.469418728183336, + "grad_norm": 0.5135140419006348, + "learning_rate": 7.554419358104688e-05, + "loss": 1.5989, + "step": 4841 + }, + { + "epoch": 1.4697222643800274, + "grad_norm": 0.5217494964599609, + "learning_rate": 7.553913131517668e-05, + "loss": 1.7296, + "step": 4842 + }, + { + "epoch": 1.4700258005767188, + "grad_norm": 0.5667610168457031, + "learning_rate": 7.553406904930647e-05, + "loss": 1.5823, + "step": 4843 + }, + { + "epoch": 1.4703293367734103, + "grad_norm": 0.9399384260177612, + "learning_rate": 7.552900678343627e-05, + "loss": 1.3248, + "step": 4844 + }, + { + "epoch": 1.4706328729701017, + "grad_norm": 0.7394365668296814, + "learning_rate": 7.552394451756608e-05, + "loss": 1.6822, + "step": 4845 + }, + { + "epoch": 1.4709364091667931, + "grad_norm": 0.532162070274353, + "learning_rate": 7.551888225169587e-05, + "loss": 1.4122, + "step": 4846 + }, + { + "epoch": 1.4712399453634846, + "grad_norm": 0.5215944647789001, + "learning_rate": 7.551381998582567e-05, + "loss": 1.6496, + "step": 4847 + }, + { + "epoch": 1.471543481560176, + "grad_norm": 0.6880224943161011, + "learning_rate": 7.550875771995546e-05, + "loss": 1.4341, + "step": 4848 + }, + { + "epoch": 1.4718470177568674, + "grad_norm": 0.5888542532920837, + "learning_rate": 7.550369545408526e-05, + "loss": 1.2608, + "step": 4849 + }, + { + "epoch": 1.472150553953559, + "grad_norm": 0.5835822224617004, + "learning_rate": 7.549863318821505e-05, + "loss": 1.7639, + "step": 4850 + }, + { + "epoch": 1.4724540901502503, + "grad_norm": 0.5503977537155151, + "learning_rate": 7.549357092234485e-05, + "loss": 1.5515, + "step": 4851 + }, + { + "epoch": 1.472757626346942, + "grad_norm": 0.548978865146637, + "learning_rate": 7.548850865647464e-05, + "loss": 1.8194, + "step": 4852 + }, + { + "epoch": 1.4730611625436334, + "grad_norm": 0.5089396238327026, + "learning_rate": 7.548344639060443e-05, + "loss": 1.4529, + "step": 4853 + }, + { + "epoch": 1.4733646987403248, + "grad_norm": 0.5043666362762451, + "learning_rate": 7.547838412473423e-05, + "loss": 1.2321, + "step": 4854 + }, + { + "epoch": 1.4736682349370163, + "grad_norm": 0.5701262354850769, + "learning_rate": 7.547332185886404e-05, + "loss": 1.4486, + "step": 4855 + }, + { + "epoch": 1.4739717711337077, + "grad_norm": 0.6709611415863037, + "learning_rate": 7.546825959299383e-05, + "loss": 1.8892, + "step": 4856 + }, + { + "epoch": 1.4742753073303991, + "grad_norm": 0.6108323931694031, + "learning_rate": 7.546319732712363e-05, + "loss": 2.0002, + "step": 4857 + }, + { + "epoch": 1.4745788435270906, + "grad_norm": 0.5647789835929871, + "learning_rate": 7.545813506125342e-05, + "loss": 1.6422, + "step": 4858 + }, + { + "epoch": 1.474882379723782, + "grad_norm": 1.221641182899475, + "learning_rate": 7.545307279538322e-05, + "loss": 1.7342, + "step": 4859 + }, + { + "epoch": 1.4751859159204734, + "grad_norm": 0.6236290335655212, + "learning_rate": 7.544801052951301e-05, + "loss": 1.5418, + "step": 4860 + }, + { + "epoch": 1.475489452117165, + "grad_norm": 0.617218017578125, + "learning_rate": 7.54429482636428e-05, + "loss": 1.1776, + "step": 4861 + }, + { + "epoch": 1.4757929883138563, + "grad_norm": 0.559785783290863, + "learning_rate": 7.54378859977726e-05, + "loss": 1.3574, + "step": 4862 + }, + { + "epoch": 1.476096524510548, + "grad_norm": 0.5048946738243103, + "learning_rate": 7.54328237319024e-05, + "loss": 1.3648, + "step": 4863 + }, + { + "epoch": 1.4764000607072394, + "grad_norm": 0.5944718718528748, + "learning_rate": 7.54277614660322e-05, + "loss": 1.8113, + "step": 4864 + }, + { + "epoch": 1.4767035969039308, + "grad_norm": 0.4631864130496979, + "learning_rate": 7.5422699200162e-05, + "loss": 1.2327, + "step": 4865 + }, + { + "epoch": 1.4770071331006223, + "grad_norm": 0.5952193737030029, + "learning_rate": 7.54176369342918e-05, + "loss": 1.5327, + "step": 4866 + }, + { + "epoch": 1.4773106692973137, + "grad_norm": 0.46949222683906555, + "learning_rate": 7.541257466842159e-05, + "loss": 1.2225, + "step": 4867 + }, + { + "epoch": 1.4776142054940051, + "grad_norm": 0.6023997664451599, + "learning_rate": 7.540751240255138e-05, + "loss": 1.8067, + "step": 4868 + }, + { + "epoch": 1.4779177416906966, + "grad_norm": 0.5851592421531677, + "learning_rate": 7.540245013668118e-05, + "loss": 1.0784, + "step": 4869 + }, + { + "epoch": 1.478221277887388, + "grad_norm": 0.5134581923484802, + "learning_rate": 7.539738787081097e-05, + "loss": 1.9496, + "step": 4870 + }, + { + "epoch": 1.4785248140840794, + "grad_norm": 0.7432266473770142, + "learning_rate": 7.539232560494077e-05, + "loss": 1.6646, + "step": 4871 + }, + { + "epoch": 1.478828350280771, + "grad_norm": 0.47093117237091064, + "learning_rate": 7.538726333907056e-05, + "loss": 1.4715, + "step": 4872 + }, + { + "epoch": 1.4791318864774623, + "grad_norm": 0.5831784605979919, + "learning_rate": 7.538220107320037e-05, + "loss": 1.4022, + "step": 4873 + }, + { + "epoch": 1.479435422674154, + "grad_norm": 0.4904673397541046, + "learning_rate": 7.537713880733017e-05, + "loss": 1.4029, + "step": 4874 + }, + { + "epoch": 1.4797389588708454, + "grad_norm": 0.7261788249015808, + "learning_rate": 7.537207654145996e-05, + "loss": 1.5217, + "step": 4875 + }, + { + "epoch": 1.4800424950675368, + "grad_norm": 0.46538203954696655, + "learning_rate": 7.536701427558976e-05, + "loss": 1.8431, + "step": 4876 + }, + { + "epoch": 1.4803460312642283, + "grad_norm": 0.6086418628692627, + "learning_rate": 7.536195200971955e-05, + "loss": 2.0019, + "step": 4877 + }, + { + "epoch": 1.4806495674609197, + "grad_norm": 0.5402958393096924, + "learning_rate": 7.535688974384935e-05, + "loss": 1.6871, + "step": 4878 + }, + { + "epoch": 1.4809531036576111, + "grad_norm": 0.5592092871665955, + "learning_rate": 7.535182747797914e-05, + "loss": 1.958, + "step": 4879 + }, + { + "epoch": 1.4812566398543026, + "grad_norm": 0.6036184430122375, + "learning_rate": 7.534676521210893e-05, + "loss": 1.6384, + "step": 4880 + }, + { + "epoch": 1.4815601760509942, + "grad_norm": 0.4480501115322113, + "learning_rate": 7.534170294623873e-05, + "loss": 1.7879, + "step": 4881 + }, + { + "epoch": 1.4818637122476854, + "grad_norm": 0.5329246520996094, + "learning_rate": 7.533664068036854e-05, + "loss": 1.4749, + "step": 4882 + }, + { + "epoch": 1.482167248444377, + "grad_norm": 0.5111972689628601, + "learning_rate": 7.533157841449833e-05, + "loss": 2.0178, + "step": 4883 + }, + { + "epoch": 1.4824707846410685, + "grad_norm": 0.47190091013908386, + "learning_rate": 7.532651614862814e-05, + "loss": 1.4551, + "step": 4884 + }, + { + "epoch": 1.48277432083776, + "grad_norm": 0.4564230442047119, + "learning_rate": 7.532145388275794e-05, + "loss": 1.4727, + "step": 4885 + }, + { + "epoch": 1.4830778570344514, + "grad_norm": 0.550284743309021, + "learning_rate": 7.531639161688773e-05, + "loss": 1.7774, + "step": 4886 + }, + { + "epoch": 1.4833813932311428, + "grad_norm": 0.9376974701881409, + "learning_rate": 7.531132935101753e-05, + "loss": 1.6029, + "step": 4887 + }, + { + "epoch": 1.4836849294278343, + "grad_norm": 0.48898518085479736, + "learning_rate": 7.530626708514732e-05, + "loss": 1.7904, + "step": 4888 + }, + { + "epoch": 1.4839884656245257, + "grad_norm": 0.5602661967277527, + "learning_rate": 7.530120481927712e-05, + "loss": 1.6557, + "step": 4889 + }, + { + "epoch": 1.4842920018212171, + "grad_norm": 0.5517251491546631, + "learning_rate": 7.529614255340691e-05, + "loss": 1.7286, + "step": 4890 + }, + { + "epoch": 1.4845955380179086, + "grad_norm": 0.6002934575080872, + "learning_rate": 7.52910802875367e-05, + "loss": 1.2734, + "step": 4891 + }, + { + "epoch": 1.4848990742146002, + "grad_norm": 0.502790629863739, + "learning_rate": 7.52860180216665e-05, + "loss": 1.6792, + "step": 4892 + }, + { + "epoch": 1.4852026104112914, + "grad_norm": 0.5758108496665955, + "learning_rate": 7.52809557557963e-05, + "loss": 1.6734, + "step": 4893 + }, + { + "epoch": 1.485506146607983, + "grad_norm": 0.5325357913970947, + "learning_rate": 7.52758934899261e-05, + "loss": 1.7765, + "step": 4894 + }, + { + "epoch": 1.4858096828046745, + "grad_norm": 0.6031767725944519, + "learning_rate": 7.52708312240559e-05, + "loss": 0.881, + "step": 4895 + }, + { + "epoch": 1.486113219001366, + "grad_norm": 0.5476002097129822, + "learning_rate": 7.526576895818569e-05, + "loss": 1.7584, + "step": 4896 + }, + { + "epoch": 1.4864167551980574, + "grad_norm": 0.6910513043403625, + "learning_rate": 7.526070669231549e-05, + "loss": 1.9068, + "step": 4897 + }, + { + "epoch": 1.4867202913947488, + "grad_norm": 0.5156431198120117, + "learning_rate": 7.525564442644528e-05, + "loss": 1.7052, + "step": 4898 + }, + { + "epoch": 1.4870238275914403, + "grad_norm": 0.9591718316078186, + "learning_rate": 7.525058216057508e-05, + "loss": 1.5279, + "step": 4899 + }, + { + "epoch": 1.4873273637881317, + "grad_norm": 0.4468207061290741, + "learning_rate": 7.524551989470487e-05, + "loss": 1.7925, + "step": 4900 + }, + { + "epoch": 1.4876308999848231, + "grad_norm": 0.5419300198554993, + "learning_rate": 7.524045762883467e-05, + "loss": 1.306, + "step": 4901 + }, + { + "epoch": 1.4879344361815146, + "grad_norm": 0.47516146302223206, + "learning_rate": 7.523539536296446e-05, + "loss": 1.7452, + "step": 4902 + }, + { + "epoch": 1.4882379723782062, + "grad_norm": 0.4809563457965851, + "learning_rate": 7.523033309709427e-05, + "loss": 1.6979, + "step": 4903 + }, + { + "epoch": 1.4885415085748974, + "grad_norm": 0.5383793711662292, + "learning_rate": 7.522527083122406e-05, + "loss": 1.7181, + "step": 4904 + }, + { + "epoch": 1.488845044771589, + "grad_norm": 0.5974624156951904, + "learning_rate": 7.522020856535386e-05, + "loss": 1.3829, + "step": 4905 + }, + { + "epoch": 1.4891485809682805, + "grad_norm": 0.5320436358451843, + "learning_rate": 7.521514629948365e-05, + "loss": 1.6578, + "step": 4906 + }, + { + "epoch": 1.489452117164972, + "grad_norm": 0.49754762649536133, + "learning_rate": 7.521008403361345e-05, + "loss": 1.7021, + "step": 4907 + }, + { + "epoch": 1.4897556533616634, + "grad_norm": 0.559622585773468, + "learning_rate": 7.520502176774324e-05, + "loss": 1.7037, + "step": 4908 + }, + { + "epoch": 1.4900591895583548, + "grad_norm": 0.5277832746505737, + "learning_rate": 7.519995950187304e-05, + "loss": 0.8649, + "step": 4909 + }, + { + "epoch": 1.4903627257550462, + "grad_norm": 0.751785159111023, + "learning_rate": 7.519489723600283e-05, + "loss": 0.7861, + "step": 4910 + }, + { + "epoch": 1.4906662619517377, + "grad_norm": 0.5476072430610657, + "learning_rate": 7.518983497013263e-05, + "loss": 1.7936, + "step": 4911 + }, + { + "epoch": 1.4909697981484293, + "grad_norm": 0.5985202789306641, + "learning_rate": 7.518477270426244e-05, + "loss": 1.6177, + "step": 4912 + }, + { + "epoch": 1.4912733343451205, + "grad_norm": 0.5717979073524475, + "learning_rate": 7.517971043839223e-05, + "loss": 1.7372, + "step": 4913 + }, + { + "epoch": 1.4915768705418122, + "grad_norm": 0.620720386505127, + "learning_rate": 7.517464817252203e-05, + "loss": 2.0156, + "step": 4914 + }, + { + "epoch": 1.4918804067385036, + "grad_norm": 0.5804673433303833, + "learning_rate": 7.516958590665182e-05, + "loss": 1.521, + "step": 4915 + }, + { + "epoch": 1.492183942935195, + "grad_norm": 0.7272177338600159, + "learning_rate": 7.516452364078162e-05, + "loss": 1.5384, + "step": 4916 + }, + { + "epoch": 1.4924874791318865, + "grad_norm": 0.6218146085739136, + "learning_rate": 7.515946137491141e-05, + "loss": 1.6263, + "step": 4917 + }, + { + "epoch": 1.492791015328578, + "grad_norm": 0.6515426635742188, + "learning_rate": 7.51543991090412e-05, + "loss": 1.1875, + "step": 4918 + }, + { + "epoch": 1.4930945515252694, + "grad_norm": 0.6261817812919617, + "learning_rate": 7.5149336843171e-05, + "loss": 1.7554, + "step": 4919 + }, + { + "epoch": 1.4933980877219608, + "grad_norm": 0.5548028945922852, + "learning_rate": 7.51442745773008e-05, + "loss": 1.5579, + "step": 4920 + }, + { + "epoch": 1.4937016239186522, + "grad_norm": 0.4987451136112213, + "learning_rate": 7.513921231143059e-05, + "loss": 1.7415, + "step": 4921 + }, + { + "epoch": 1.4940051601153437, + "grad_norm": 0.3654840290546417, + "learning_rate": 7.51341500455604e-05, + "loss": 0.965, + "step": 4922 + }, + { + "epoch": 1.4943086963120353, + "grad_norm": 0.6118152141571045, + "learning_rate": 7.512908777969019e-05, + "loss": 1.4371, + "step": 4923 + }, + { + "epoch": 1.4946122325087265, + "grad_norm": 0.5052013993263245, + "learning_rate": 7.512402551381999e-05, + "loss": 1.5476, + "step": 4924 + }, + { + "epoch": 1.4949157687054182, + "grad_norm": 0.5931279063224792, + "learning_rate": 7.511896324794978e-05, + "loss": 1.8175, + "step": 4925 + }, + { + "epoch": 1.4952193049021096, + "grad_norm": 1.926950454711914, + "learning_rate": 7.511390098207958e-05, + "loss": 1.6452, + "step": 4926 + }, + { + "epoch": 1.495522841098801, + "grad_norm": 0.5131598711013794, + "learning_rate": 7.510883871620939e-05, + "loss": 1.7048, + "step": 4927 + }, + { + "epoch": 1.4958263772954925, + "grad_norm": 0.6365157961845398, + "learning_rate": 7.510377645033918e-05, + "loss": 1.4151, + "step": 4928 + }, + { + "epoch": 1.496129913492184, + "grad_norm": 0.48523157835006714, + "learning_rate": 7.509871418446897e-05, + "loss": 1.9676, + "step": 4929 + }, + { + "epoch": 1.4964334496888754, + "grad_norm": 0.4741292893886566, + "learning_rate": 7.509365191859877e-05, + "loss": 1.9867, + "step": 4930 + }, + { + "epoch": 1.4967369858855668, + "grad_norm": 0.6503263115882874, + "learning_rate": 7.508858965272856e-05, + "loss": 1.6746, + "step": 4931 + }, + { + "epoch": 1.4970405220822582, + "grad_norm": 0.5478717684745789, + "learning_rate": 7.508352738685836e-05, + "loss": 1.711, + "step": 4932 + }, + { + "epoch": 1.4973440582789497, + "grad_norm": 0.5373908877372742, + "learning_rate": 7.507846512098817e-05, + "loss": 1.7798, + "step": 4933 + }, + { + "epoch": 1.4976475944756413, + "grad_norm": 0.559581995010376, + "learning_rate": 7.507340285511796e-05, + "loss": 1.351, + "step": 4934 + }, + { + "epoch": 1.4979511306723325, + "grad_norm": 0.5667752027511597, + "learning_rate": 7.506834058924776e-05, + "loss": 1.3402, + "step": 4935 + }, + { + "epoch": 1.4982546668690242, + "grad_norm": 0.4814712703227997, + "learning_rate": 7.506327832337755e-05, + "loss": 1.7732, + "step": 4936 + }, + { + "epoch": 1.4985582030657156, + "grad_norm": 0.5498847365379333, + "learning_rate": 7.505821605750735e-05, + "loss": 1.7493, + "step": 4937 + }, + { + "epoch": 1.498861739262407, + "grad_norm": 0.4908760190010071, + "learning_rate": 7.505315379163714e-05, + "loss": 1.8062, + "step": 4938 + }, + { + "epoch": 1.4991652754590985, + "grad_norm": 0.4848566949367523, + "learning_rate": 7.504809152576694e-05, + "loss": 1.9671, + "step": 4939 + }, + { + "epoch": 1.49946881165579, + "grad_norm": 0.49739202857017517, + "learning_rate": 7.504302925989673e-05, + "loss": 1.3136, + "step": 4940 + }, + { + "epoch": 1.4997723478524814, + "grad_norm": 0.5910068154335022, + "learning_rate": 7.503796699402653e-05, + "loss": 1.7624, + "step": 4941 + }, + { + "epoch": 1.5000758840491728, + "grad_norm": 0.4773714542388916, + "learning_rate": 7.503290472815633e-05, + "loss": 1.6358, + "step": 4942 + }, + { + "epoch": 1.5003794202458645, + "grad_norm": 0.5713791847229004, + "learning_rate": 7.502784246228613e-05, + "loss": 1.8797, + "step": 4943 + }, + { + "epoch": 1.5006829564425557, + "grad_norm": 0.5687196850776672, + "learning_rate": 7.502278019641592e-05, + "loss": 1.8505, + "step": 4944 + }, + { + "epoch": 1.5009864926392473, + "grad_norm": 0.5714086294174194, + "learning_rate": 7.501771793054572e-05, + "loss": 1.0782, + "step": 4945 + }, + { + "epoch": 1.5012900288359385, + "grad_norm": 0.5151203870773315, + "learning_rate": 7.501265566467551e-05, + "loss": 1.8271, + "step": 4946 + }, + { + "epoch": 1.5015935650326302, + "grad_norm": 0.7299343943595886, + "learning_rate": 7.500759339880531e-05, + "loss": 1.5812, + "step": 4947 + }, + { + "epoch": 1.5018971012293216, + "grad_norm": 1.2121248245239258, + "learning_rate": 7.50025311329351e-05, + "loss": 1.4983, + "step": 4948 + }, + { + "epoch": 1.502200637426013, + "grad_norm": 0.5982694029808044, + "learning_rate": 7.49974688670649e-05, + "loss": 1.2174, + "step": 4949 + }, + { + "epoch": 1.5025041736227045, + "grad_norm": 0.5536935329437256, + "learning_rate": 7.499240660119469e-05, + "loss": 1.496, + "step": 4950 + }, + { + "epoch": 1.502807709819396, + "grad_norm": 0.579058825969696, + "learning_rate": 7.49873443353245e-05, + "loss": 1.6991, + "step": 4951 + }, + { + "epoch": 1.5031112460160874, + "grad_norm": 0.5158204436302185, + "learning_rate": 7.49822820694543e-05, + "loss": 1.6041, + "step": 4952 + }, + { + "epoch": 1.5034147822127788, + "grad_norm": 0.5574657917022705, + "learning_rate": 7.497721980358409e-05, + "loss": 0.996, + "step": 4953 + }, + { + "epoch": 1.5037183184094705, + "grad_norm": 0.5834664106369019, + "learning_rate": 7.497215753771389e-05, + "loss": 1.675, + "step": 4954 + }, + { + "epoch": 1.5040218546061617, + "grad_norm": 0.5058130025863647, + "learning_rate": 7.496709527184368e-05, + "loss": 1.8119, + "step": 4955 + }, + { + "epoch": 1.5043253908028533, + "grad_norm": 0.6237099170684814, + "learning_rate": 7.496203300597347e-05, + "loss": 1.7406, + "step": 4956 + }, + { + "epoch": 1.5046289269995445, + "grad_norm": 0.579137921333313, + "learning_rate": 7.495697074010327e-05, + "loss": 1.7724, + "step": 4957 + }, + { + "epoch": 1.5049324631962362, + "grad_norm": 0.6171398162841797, + "learning_rate": 7.495190847423306e-05, + "loss": 1.6596, + "step": 4958 + }, + { + "epoch": 1.5052359993929276, + "grad_norm": 0.5880246162414551, + "learning_rate": 7.494684620836286e-05, + "loss": 1.5784, + "step": 4959 + }, + { + "epoch": 1.505539535589619, + "grad_norm": 0.48399999737739563, + "learning_rate": 7.494178394249265e-05, + "loss": 0.9879, + "step": 4960 + }, + { + "epoch": 1.5058430717863105, + "grad_norm": 0.6185703277587891, + "learning_rate": 7.493672167662246e-05, + "loss": 1.5767, + "step": 4961 + }, + { + "epoch": 1.506146607983002, + "grad_norm": 0.5096151232719421, + "learning_rate": 7.493165941075226e-05, + "loss": 1.5414, + "step": 4962 + }, + { + "epoch": 1.5064501441796936, + "grad_norm": 0.5900314450263977, + "learning_rate": 7.492659714488205e-05, + "loss": 1.9153, + "step": 4963 + }, + { + "epoch": 1.5067536803763848, + "grad_norm": 0.6663991808891296, + "learning_rate": 7.492153487901185e-05, + "loss": 1.6301, + "step": 4964 + }, + { + "epoch": 1.5070572165730765, + "grad_norm": 0.6082780957221985, + "learning_rate": 7.491647261314164e-05, + "loss": 1.9522, + "step": 4965 + }, + { + "epoch": 1.5073607527697677, + "grad_norm": 0.5442984700202942, + "learning_rate": 7.491141034727144e-05, + "loss": 0.9322, + "step": 4966 + }, + { + "epoch": 1.5076642889664593, + "grad_norm": 0.5286507606506348, + "learning_rate": 7.490634808140123e-05, + "loss": 1.6876, + "step": 4967 + }, + { + "epoch": 1.5079678251631508, + "grad_norm": 0.5359693169593811, + "learning_rate": 7.490128581553103e-05, + "loss": 1.4959, + "step": 4968 + }, + { + "epoch": 1.5082713613598422, + "grad_norm": 0.5242936611175537, + "learning_rate": 7.489622354966082e-05, + "loss": 1.9794, + "step": 4969 + }, + { + "epoch": 1.5085748975565336, + "grad_norm": 0.42484426498413086, + "learning_rate": 7.489116128379063e-05, + "loss": 1.7186, + "step": 4970 + }, + { + "epoch": 1.508878433753225, + "grad_norm": 0.8689149618148804, + "learning_rate": 7.488609901792042e-05, + "loss": 1.6143, + "step": 4971 + }, + { + "epoch": 1.5091819699499165, + "grad_norm": 0.49162110686302185, + "learning_rate": 7.488103675205023e-05, + "loss": 1.7344, + "step": 4972 + }, + { + "epoch": 1.509485506146608, + "grad_norm": 0.541732132434845, + "learning_rate": 7.487597448618003e-05, + "loss": 1.5149, + "step": 4973 + }, + { + "epoch": 1.5097890423432996, + "grad_norm": 0.542533278465271, + "learning_rate": 7.487091222030982e-05, + "loss": 1.8941, + "step": 4974 + }, + { + "epoch": 1.5100925785399908, + "grad_norm": 0.500696063041687, + "learning_rate": 7.486584995443962e-05, + "loss": 1.7141, + "step": 4975 + }, + { + "epoch": 1.5103961147366824, + "grad_norm": 0.5934485197067261, + "learning_rate": 7.486078768856941e-05, + "loss": 1.4603, + "step": 4976 + }, + { + "epoch": 1.5106996509333737, + "grad_norm": 1.424613356590271, + "learning_rate": 7.48557254226992e-05, + "loss": 1.6062, + "step": 4977 + }, + { + "epoch": 1.5110031871300653, + "grad_norm": 0.48682644963264465, + "learning_rate": 7.4850663156829e-05, + "loss": 1.7172, + "step": 4978 + }, + { + "epoch": 1.5113067233267568, + "grad_norm": 0.4934718608856201, + "learning_rate": 7.48456008909588e-05, + "loss": 1.7463, + "step": 4979 + }, + { + "epoch": 1.5116102595234482, + "grad_norm": 0.5626336336135864, + "learning_rate": 7.484053862508859e-05, + "loss": 1.5496, + "step": 4980 + }, + { + "epoch": 1.5119137957201396, + "grad_norm": 0.4722979962825775, + "learning_rate": 7.48354763592184e-05, + "loss": 1.6378, + "step": 4981 + }, + { + "epoch": 1.512217331916831, + "grad_norm": 0.5361006855964661, + "learning_rate": 7.48304140933482e-05, + "loss": 1.5269, + "step": 4982 + }, + { + "epoch": 1.5125208681135225, + "grad_norm": 0.5464332103729248, + "learning_rate": 7.482535182747799e-05, + "loss": 1.8989, + "step": 4983 + }, + { + "epoch": 1.512824404310214, + "grad_norm": 0.4606230556964874, + "learning_rate": 7.482028956160778e-05, + "loss": 1.8319, + "step": 4984 + }, + { + "epoch": 1.5131279405069056, + "grad_norm": 0.5384346842765808, + "learning_rate": 7.481522729573758e-05, + "loss": 1.8504, + "step": 4985 + }, + { + "epoch": 1.5134314767035968, + "grad_norm": 0.49810662865638733, + "learning_rate": 7.481016502986737e-05, + "loss": 1.7573, + "step": 4986 + }, + { + "epoch": 1.5137350129002884, + "grad_norm": 0.5364110469818115, + "learning_rate": 7.480510276399717e-05, + "loss": 1.8853, + "step": 4987 + }, + { + "epoch": 1.5140385490969797, + "grad_norm": 0.5262428522109985, + "learning_rate": 7.480004049812696e-05, + "loss": 1.9199, + "step": 4988 + }, + { + "epoch": 1.5143420852936713, + "grad_norm": 0.48900026082992554, + "learning_rate": 7.479497823225676e-05, + "loss": 1.8181, + "step": 4989 + }, + { + "epoch": 1.5146456214903627, + "grad_norm": 0.5142418742179871, + "learning_rate": 7.478991596638657e-05, + "loss": 1.801, + "step": 4990 + }, + { + "epoch": 1.5149491576870542, + "grad_norm": 0.5188509225845337, + "learning_rate": 7.478485370051636e-05, + "loss": 1.7511, + "step": 4991 + }, + { + "epoch": 1.5152526938837456, + "grad_norm": 0.5165709853172302, + "learning_rate": 7.477979143464616e-05, + "loss": 1.7038, + "step": 4992 + }, + { + "epoch": 1.515556230080437, + "grad_norm": 0.5722384452819824, + "learning_rate": 7.477472916877595e-05, + "loss": 1.7524, + "step": 4993 + }, + { + "epoch": 1.5158597662771287, + "grad_norm": 0.6268503665924072, + "learning_rate": 7.476966690290574e-05, + "loss": 1.7185, + "step": 4994 + }, + { + "epoch": 1.51616330247382, + "grad_norm": 0.8802040815353394, + "learning_rate": 7.476460463703554e-05, + "loss": 1.4555, + "step": 4995 + }, + { + "epoch": 1.5164668386705116, + "grad_norm": 0.5494096875190735, + "learning_rate": 7.475954237116533e-05, + "loss": 2.1006, + "step": 4996 + }, + { + "epoch": 1.5167703748672028, + "grad_norm": 0.4991436004638672, + "learning_rate": 7.475448010529513e-05, + "loss": 1.905, + "step": 4997 + }, + { + "epoch": 1.5170739110638944, + "grad_norm": 0.5484010577201843, + "learning_rate": 7.474941783942492e-05, + "loss": 1.7698, + "step": 4998 + }, + { + "epoch": 1.5173774472605859, + "grad_norm": 0.5888267755508423, + "learning_rate": 7.474435557355472e-05, + "loss": 1.591, + "step": 4999 + }, + { + "epoch": 1.5176809834572773, + "grad_norm": 0.6114574670791626, + "learning_rate": 7.473929330768453e-05, + "loss": 1.474, + "step": 5000 + }, + { + "epoch": 1.5179845196539687, + "grad_norm": 0.48462623357772827, + "learning_rate": 7.473423104181432e-05, + "loss": 1.3337, + "step": 5001 + }, + { + "epoch": 1.5182880558506602, + "grad_norm": 0.5276066660881042, + "learning_rate": 7.472916877594412e-05, + "loss": 1.7273, + "step": 5002 + }, + { + "epoch": 1.5185915920473516, + "grad_norm": 0.520941972732544, + "learning_rate": 7.472410651007391e-05, + "loss": 1.3722, + "step": 5003 + }, + { + "epoch": 1.518895128244043, + "grad_norm": 1.0179864168167114, + "learning_rate": 7.47190442442037e-05, + "loss": 1.4482, + "step": 5004 + }, + { + "epoch": 1.5191986644407347, + "grad_norm": 0.5575673580169678, + "learning_rate": 7.47139819783335e-05, + "loss": 1.6712, + "step": 5005 + }, + { + "epoch": 1.519502200637426, + "grad_norm": 0.6005712747573853, + "learning_rate": 7.47089197124633e-05, + "loss": 1.3345, + "step": 5006 + }, + { + "epoch": 1.5198057368341176, + "grad_norm": 0.8138278722763062, + "learning_rate": 7.470385744659309e-05, + "loss": 1.6787, + "step": 5007 + }, + { + "epoch": 1.5201092730308088, + "grad_norm": 0.5643385052680969, + "learning_rate": 7.469879518072289e-05, + "loss": 1.9712, + "step": 5008 + }, + { + "epoch": 1.5204128092275004, + "grad_norm": 0.6395078301429749, + "learning_rate": 7.46937329148527e-05, + "loss": 1.2771, + "step": 5009 + }, + { + "epoch": 1.5207163454241919, + "grad_norm": 0.5270484089851379, + "learning_rate": 7.468867064898249e-05, + "loss": 1.7923, + "step": 5010 + }, + { + "epoch": 1.5210198816208833, + "grad_norm": 0.7293537259101868, + "learning_rate": 7.468360838311228e-05, + "loss": 1.8557, + "step": 5011 + }, + { + "epoch": 1.5213234178175747, + "grad_norm": 0.4917014539241791, + "learning_rate": 7.467854611724208e-05, + "loss": 1.8161, + "step": 5012 + }, + { + "epoch": 1.5216269540142662, + "grad_norm": 0.4320582449436188, + "learning_rate": 7.467348385137187e-05, + "loss": 1.7172, + "step": 5013 + }, + { + "epoch": 1.5219304902109576, + "grad_norm": 0.4874110817909241, + "learning_rate": 7.466842158550167e-05, + "loss": 1.7386, + "step": 5014 + }, + { + "epoch": 1.522234026407649, + "grad_norm": 0.45594799518585205, + "learning_rate": 7.466335931963146e-05, + "loss": 1.8101, + "step": 5015 + }, + { + "epoch": 1.5225375626043407, + "grad_norm": 0.5213348269462585, + "learning_rate": 7.465829705376127e-05, + "loss": 1.5338, + "step": 5016 + }, + { + "epoch": 1.522841098801032, + "grad_norm": 0.5447767376899719, + "learning_rate": 7.465323478789107e-05, + "loss": 1.7583, + "step": 5017 + }, + { + "epoch": 1.5231446349977236, + "grad_norm": 0.45312610268592834, + "learning_rate": 7.464817252202086e-05, + "loss": 1.2033, + "step": 5018 + }, + { + "epoch": 1.5234481711944148, + "grad_norm": 0.8900678157806396, + "learning_rate": 7.464311025615066e-05, + "loss": 1.3666, + "step": 5019 + }, + { + "epoch": 1.5237517073911064, + "grad_norm": 0.6067972183227539, + "learning_rate": 7.463804799028046e-05, + "loss": 1.7647, + "step": 5020 + }, + { + "epoch": 1.5240552435877979, + "grad_norm": 0.5241597890853882, + "learning_rate": 7.463298572441026e-05, + "loss": 1.7492, + "step": 5021 + }, + { + "epoch": 1.5243587797844893, + "grad_norm": 0.5625531673431396, + "learning_rate": 7.462792345854005e-05, + "loss": 2.0467, + "step": 5022 + }, + { + "epoch": 1.5246623159811807, + "grad_norm": 0.4781433939933777, + "learning_rate": 7.462286119266985e-05, + "loss": 1.3643, + "step": 5023 + }, + { + "epoch": 1.5249658521778722, + "grad_norm": 0.6785547137260437, + "learning_rate": 7.461779892679964e-05, + "loss": 1.9067, + "step": 5024 + }, + { + "epoch": 1.5252693883745638, + "grad_norm": 0.5150321125984192, + "learning_rate": 7.461273666092944e-05, + "loss": 1.8381, + "step": 5025 + }, + { + "epoch": 1.525572924571255, + "grad_norm": 0.49465686082839966, + "learning_rate": 7.460767439505923e-05, + "loss": 1.5487, + "step": 5026 + }, + { + "epoch": 1.5258764607679467, + "grad_norm": 0.6422455906867981, + "learning_rate": 7.460261212918903e-05, + "loss": 1.8731, + "step": 5027 + }, + { + "epoch": 1.526179996964638, + "grad_norm": 0.5934743881225586, + "learning_rate": 7.459754986331882e-05, + "loss": 1.6849, + "step": 5028 + }, + { + "epoch": 1.5264835331613296, + "grad_norm": 0.4186806380748749, + "learning_rate": 7.459248759744863e-05, + "loss": 1.8836, + "step": 5029 + }, + { + "epoch": 1.526787069358021, + "grad_norm": 0.3892343044281006, + "learning_rate": 7.458742533157843e-05, + "loss": 1.5684, + "step": 5030 + }, + { + "epoch": 1.5270906055547124, + "grad_norm": 0.5003823041915894, + "learning_rate": 7.458236306570822e-05, + "loss": 1.2924, + "step": 5031 + }, + { + "epoch": 1.5273941417514039, + "grad_norm": 0.6574296355247498, + "learning_rate": 7.457730079983801e-05, + "loss": 1.8827, + "step": 5032 + }, + { + "epoch": 1.5276976779480953, + "grad_norm": 0.5175241231918335, + "learning_rate": 7.457223853396781e-05, + "loss": 1.5016, + "step": 5033 + }, + { + "epoch": 1.5280012141447867, + "grad_norm": 0.5171802043914795, + "learning_rate": 7.45671762680976e-05, + "loss": 1.7959, + "step": 5034 + }, + { + "epoch": 1.5283047503414782, + "grad_norm": 0.48254334926605225, + "learning_rate": 7.45621140022274e-05, + "loss": 1.8864, + "step": 5035 + }, + { + "epoch": 1.5286082865381698, + "grad_norm": 0.6125036478042603, + "learning_rate": 7.45570517363572e-05, + "loss": 1.6523, + "step": 5036 + }, + { + "epoch": 1.528911822734861, + "grad_norm": 0.6008560657501221, + "learning_rate": 7.455198947048699e-05, + "loss": 1.2628, + "step": 5037 + }, + { + "epoch": 1.5292153589315527, + "grad_norm": 0.4975941777229309, + "learning_rate": 7.454692720461678e-05, + "loss": 1.8079, + "step": 5038 + }, + { + "epoch": 1.529518895128244, + "grad_norm": 0.5716906785964966, + "learning_rate": 7.454186493874659e-05, + "loss": 1.1925, + "step": 5039 + }, + { + "epoch": 1.5298224313249356, + "grad_norm": 0.8717072606086731, + "learning_rate": 7.453680267287639e-05, + "loss": 1.8971, + "step": 5040 + }, + { + "epoch": 1.530125967521627, + "grad_norm": 0.533805251121521, + "learning_rate": 7.453174040700618e-05, + "loss": 1.5106, + "step": 5041 + }, + { + "epoch": 1.5304295037183184, + "grad_norm": 1.7133526802062988, + "learning_rate": 7.452667814113598e-05, + "loss": 1.8119, + "step": 5042 + }, + { + "epoch": 1.5307330399150099, + "grad_norm": 0.5104836225509644, + "learning_rate": 7.452161587526577e-05, + "loss": 1.7062, + "step": 5043 + }, + { + "epoch": 1.5310365761117013, + "grad_norm": 0.4728488028049469, + "learning_rate": 7.451655360939557e-05, + "loss": 1.494, + "step": 5044 + }, + { + "epoch": 1.5313401123083927, + "grad_norm": 0.511103093624115, + "learning_rate": 7.451149134352536e-05, + "loss": 1.2143, + "step": 5045 + }, + { + "epoch": 1.5316436485050842, + "grad_norm": 0.5534620881080627, + "learning_rate": 7.450642907765516e-05, + "loss": 1.6128, + "step": 5046 + }, + { + "epoch": 1.5319471847017758, + "grad_norm": 0.5195817351341248, + "learning_rate": 7.450136681178495e-05, + "loss": 1.7958, + "step": 5047 + }, + { + "epoch": 1.532250720898467, + "grad_norm": 0.6067714095115662, + "learning_rate": 7.449630454591476e-05, + "loss": 1.8131, + "step": 5048 + }, + { + "epoch": 1.5325542570951587, + "grad_norm": 0.62360018491745, + "learning_rate": 7.449124228004455e-05, + "loss": 2.1314, + "step": 5049 + }, + { + "epoch": 1.53285779329185, + "grad_norm": 0.5383068323135376, + "learning_rate": 7.448618001417435e-05, + "loss": 1.8666, + "step": 5050 + }, + { + "epoch": 1.5331613294885416, + "grad_norm": 0.5535653829574585, + "learning_rate": 7.448111774830414e-05, + "loss": 1.5019, + "step": 5051 + }, + { + "epoch": 1.533464865685233, + "grad_norm": 0.5406197309494019, + "learning_rate": 7.447605548243394e-05, + "loss": 1.6894, + "step": 5052 + }, + { + "epoch": 1.5337684018819244, + "grad_norm": 0.5757835507392883, + "learning_rate": 7.447099321656373e-05, + "loss": 1.1037, + "step": 5053 + }, + { + "epoch": 1.5340719380786159, + "grad_norm": 0.6058068871498108, + "learning_rate": 7.446593095069353e-05, + "loss": 1.4349, + "step": 5054 + }, + { + "epoch": 1.5343754742753073, + "grad_norm": 0.5739513039588928, + "learning_rate": 7.446086868482332e-05, + "loss": 1.8045, + "step": 5055 + }, + { + "epoch": 1.534679010471999, + "grad_norm": 0.5705426931381226, + "learning_rate": 7.445580641895312e-05, + "loss": 1.8432, + "step": 5056 + }, + { + "epoch": 1.5349825466686902, + "grad_norm": 0.46640723943710327, + "learning_rate": 7.445074415308293e-05, + "loss": 1.7681, + "step": 5057 + }, + { + "epoch": 1.5352860828653818, + "grad_norm": 0.5512862801551819, + "learning_rate": 7.444568188721272e-05, + "loss": 1.8849, + "step": 5058 + }, + { + "epoch": 1.535589619062073, + "grad_norm": 0.5038082003593445, + "learning_rate": 7.444061962134251e-05, + "loss": 1.3138, + "step": 5059 + }, + { + "epoch": 1.5358931552587647, + "grad_norm": 0.5421281456947327, + "learning_rate": 7.443555735547232e-05, + "loss": 1.4232, + "step": 5060 + }, + { + "epoch": 1.536196691455456, + "grad_norm": 0.5326946973800659, + "learning_rate": 7.443049508960212e-05, + "loss": 1.7202, + "step": 5061 + }, + { + "epoch": 1.5365002276521476, + "grad_norm": 0.4917674660682678, + "learning_rate": 7.442543282373191e-05, + "loss": 1.8367, + "step": 5062 + }, + { + "epoch": 1.536803763848839, + "grad_norm": 0.49285170435905457, + "learning_rate": 7.442037055786171e-05, + "loss": 1.6482, + "step": 5063 + }, + { + "epoch": 1.5371073000455304, + "grad_norm": 0.5591158270835876, + "learning_rate": 7.44153082919915e-05, + "loss": 1.943, + "step": 5064 + }, + { + "epoch": 1.5374108362422219, + "grad_norm": 0.531862199306488, + "learning_rate": 7.44102460261213e-05, + "loss": 1.7295, + "step": 5065 + }, + { + "epoch": 1.5377143724389133, + "grad_norm": 0.7069116234779358, + "learning_rate": 7.440518376025109e-05, + "loss": 1.5248, + "step": 5066 + }, + { + "epoch": 1.538017908635605, + "grad_norm": 0.5532852411270142, + "learning_rate": 7.440012149438089e-05, + "loss": 1.5866, + "step": 5067 + }, + { + "epoch": 1.5383214448322962, + "grad_norm": 0.4988435208797455, + "learning_rate": 7.43950592285107e-05, + "loss": 1.711, + "step": 5068 + }, + { + "epoch": 1.5386249810289878, + "grad_norm": 0.490222305059433, + "learning_rate": 7.438999696264049e-05, + "loss": 1.8174, + "step": 5069 + }, + { + "epoch": 1.538928517225679, + "grad_norm": 0.5370107889175415, + "learning_rate": 7.438493469677028e-05, + "loss": 1.8036, + "step": 5070 + }, + { + "epoch": 1.5392320534223707, + "grad_norm": 0.5577163100242615, + "learning_rate": 7.437987243090008e-05, + "loss": 1.4909, + "step": 5071 + }, + { + "epoch": 1.5395355896190621, + "grad_norm": 0.49091118574142456, + "learning_rate": 7.437481016502987e-05, + "loss": 1.1968, + "step": 5072 + }, + { + "epoch": 1.5398391258157536, + "grad_norm": 0.5760853290557861, + "learning_rate": 7.436974789915967e-05, + "loss": 1.7479, + "step": 5073 + }, + { + "epoch": 1.540142662012445, + "grad_norm": 0.4731634855270386, + "learning_rate": 7.436468563328946e-05, + "loss": 1.7511, + "step": 5074 + }, + { + "epoch": 1.5404461982091364, + "grad_norm": 0.5668302774429321, + "learning_rate": 7.435962336741926e-05, + "loss": 1.5773, + "step": 5075 + }, + { + "epoch": 1.5407497344058279, + "grad_norm": 0.477153480052948, + "learning_rate": 7.435456110154905e-05, + "loss": 1.783, + "step": 5076 + }, + { + "epoch": 1.5410532706025193, + "grad_norm": 0.532961368560791, + "learning_rate": 7.434949883567885e-05, + "loss": 1.815, + "step": 5077 + }, + { + "epoch": 1.541356806799211, + "grad_norm": 0.584200382232666, + "learning_rate": 7.434443656980866e-05, + "loss": 1.5517, + "step": 5078 + }, + { + "epoch": 1.5416603429959022, + "grad_norm": 0.6001595258712769, + "learning_rate": 7.433937430393845e-05, + "loss": 1.5844, + "step": 5079 + }, + { + "epoch": 1.5419638791925938, + "grad_norm": 0.5196036100387573, + "learning_rate": 7.433431203806825e-05, + "loss": 1.5434, + "step": 5080 + }, + { + "epoch": 1.542267415389285, + "grad_norm": 0.5187863111495972, + "learning_rate": 7.432924977219804e-05, + "loss": 1.5561, + "step": 5081 + }, + { + "epoch": 1.5425709515859767, + "grad_norm": 0.6012201309204102, + "learning_rate": 7.432418750632784e-05, + "loss": 1.31, + "step": 5082 + }, + { + "epoch": 1.5428744877826681, + "grad_norm": 0.5970558524131775, + "learning_rate": 7.431912524045763e-05, + "loss": 1.8189, + "step": 5083 + }, + { + "epoch": 1.5431780239793595, + "grad_norm": 0.5802868008613586, + "learning_rate": 7.431406297458743e-05, + "loss": 1.6781, + "step": 5084 + }, + { + "epoch": 1.543481560176051, + "grad_norm": 0.48958060145378113, + "learning_rate": 7.430900070871722e-05, + "loss": 1.2931, + "step": 5085 + }, + { + "epoch": 1.5437850963727424, + "grad_norm": 0.47764867544174194, + "learning_rate": 7.430393844284701e-05, + "loss": 1.8627, + "step": 5086 + }, + { + "epoch": 1.5440886325694338, + "grad_norm": 0.6315385103225708, + "learning_rate": 7.429887617697682e-05, + "loss": 1.1492, + "step": 5087 + }, + { + "epoch": 1.5443921687661253, + "grad_norm": 0.6129793524742126, + "learning_rate": 7.429381391110662e-05, + "loss": 1.6304, + "step": 5088 + }, + { + "epoch": 1.544695704962817, + "grad_norm": 1.4443278312683105, + "learning_rate": 7.428875164523641e-05, + "loss": 1.9673, + "step": 5089 + }, + { + "epoch": 1.5449992411595082, + "grad_norm": 0.596580445766449, + "learning_rate": 7.428368937936621e-05, + "loss": 1.7126, + "step": 5090 + }, + { + "epoch": 1.5453027773561998, + "grad_norm": 0.5361682772636414, + "learning_rate": 7.4278627113496e-05, + "loss": 1.8962, + "step": 5091 + }, + { + "epoch": 1.545606313552891, + "grad_norm": 0.5299656987190247, + "learning_rate": 7.42735648476258e-05, + "loss": 1.7639, + "step": 5092 + }, + { + "epoch": 1.5459098497495827, + "grad_norm": 0.4721045196056366, + "learning_rate": 7.426850258175559e-05, + "loss": 1.7856, + "step": 5093 + }, + { + "epoch": 1.546213385946274, + "grad_norm": 0.5393961668014526, + "learning_rate": 7.426344031588539e-05, + "loss": 1.7939, + "step": 5094 + }, + { + "epoch": 1.5465169221429655, + "grad_norm": 0.48364582657814026, + "learning_rate": 7.425837805001518e-05, + "loss": 1.7527, + "step": 5095 + }, + { + "epoch": 1.546820458339657, + "grad_norm": 0.5363742113113403, + "learning_rate": 7.425331578414499e-05, + "loss": 1.445, + "step": 5096 + }, + { + "epoch": 1.5471239945363484, + "grad_norm": 0.676005482673645, + "learning_rate": 7.424825351827478e-05, + "loss": 1.8364, + "step": 5097 + }, + { + "epoch": 1.54742753073304, + "grad_norm": 0.5144119262695312, + "learning_rate": 7.424319125240458e-05, + "loss": 1.2264, + "step": 5098 + }, + { + "epoch": 1.5477310669297313, + "grad_norm": 0.6479708552360535, + "learning_rate": 7.423812898653437e-05, + "loss": 1.4466, + "step": 5099 + }, + { + "epoch": 1.548034603126423, + "grad_norm": 0.6273570656776428, + "learning_rate": 7.423306672066417e-05, + "loss": 1.7581, + "step": 5100 + }, + { + "epoch": 1.5483381393231141, + "grad_norm": 0.4922904372215271, + "learning_rate": 7.422800445479396e-05, + "loss": 1.6592, + "step": 5101 + }, + { + "epoch": 1.5486416755198058, + "grad_norm": 0.5469740033149719, + "learning_rate": 7.422294218892376e-05, + "loss": 1.8006, + "step": 5102 + }, + { + "epoch": 1.5489452117164972, + "grad_norm": 0.5148813128471375, + "learning_rate": 7.421787992305355e-05, + "loss": 1.4116, + "step": 5103 + }, + { + "epoch": 1.5492487479131887, + "grad_norm": 0.5936328768730164, + "learning_rate": 7.421281765718335e-05, + "loss": 1.7438, + "step": 5104 + }, + { + "epoch": 1.54955228410988, + "grad_norm": 0.6445397138595581, + "learning_rate": 7.420775539131316e-05, + "loss": 1.5208, + "step": 5105 + }, + { + "epoch": 1.5498558203065715, + "grad_norm": 0.431699275970459, + "learning_rate": 7.420269312544295e-05, + "loss": 0.9626, + "step": 5106 + }, + { + "epoch": 1.550159356503263, + "grad_norm": 0.5394765734672546, + "learning_rate": 7.419763085957276e-05, + "loss": 1.8403, + "step": 5107 + }, + { + "epoch": 1.5504628926999544, + "grad_norm": 0.5958315134048462, + "learning_rate": 7.419256859370255e-05, + "loss": 1.7594, + "step": 5108 + }, + { + "epoch": 1.550766428896646, + "grad_norm": 0.5397548675537109, + "learning_rate": 7.418750632783235e-05, + "loss": 1.4339, + "step": 5109 + }, + { + "epoch": 1.5510699650933373, + "grad_norm": 0.6198435425758362, + "learning_rate": 7.418244406196214e-05, + "loss": 1.5588, + "step": 5110 + }, + { + "epoch": 1.551373501290029, + "grad_norm": 0.5071767568588257, + "learning_rate": 7.417738179609194e-05, + "loss": 1.7752, + "step": 5111 + }, + { + "epoch": 1.5516770374867201, + "grad_norm": 0.5771663784980774, + "learning_rate": 7.417231953022173e-05, + "loss": 1.8022, + "step": 5112 + }, + { + "epoch": 1.5519805736834118, + "grad_norm": 0.4798254668712616, + "learning_rate": 7.416725726435153e-05, + "loss": 1.6264, + "step": 5113 + }, + { + "epoch": 1.5522841098801032, + "grad_norm": 0.5879905819892883, + "learning_rate": 7.416219499848132e-05, + "loss": 1.7573, + "step": 5114 + }, + { + "epoch": 1.5525876460767947, + "grad_norm": 0.5705510973930359, + "learning_rate": 7.415713273261112e-05, + "loss": 1.7869, + "step": 5115 + }, + { + "epoch": 1.552891182273486, + "grad_norm": 0.5407357215881348, + "learning_rate": 7.415207046674091e-05, + "loss": 1.7814, + "step": 5116 + }, + { + "epoch": 1.5531947184701775, + "grad_norm": 0.5827559232711792, + "learning_rate": 7.414700820087072e-05, + "loss": 1.6038, + "step": 5117 + }, + { + "epoch": 1.553498254666869, + "grad_norm": 0.49504661560058594, + "learning_rate": 7.414194593500052e-05, + "loss": 1.3291, + "step": 5118 + }, + { + "epoch": 1.5538017908635604, + "grad_norm": 0.6688182353973389, + "learning_rate": 7.413688366913031e-05, + "loss": 1.8172, + "step": 5119 + }, + { + "epoch": 1.554105327060252, + "grad_norm": 0.5043573379516602, + "learning_rate": 7.41318214032601e-05, + "loss": 1.7996, + "step": 5120 + }, + { + "epoch": 1.5544088632569433, + "grad_norm": 0.508563220500946, + "learning_rate": 7.41267591373899e-05, + "loss": 1.6502, + "step": 5121 + }, + { + "epoch": 1.554712399453635, + "grad_norm": 0.5409261584281921, + "learning_rate": 7.41216968715197e-05, + "loss": 1.8077, + "step": 5122 + }, + { + "epoch": 1.5550159356503261, + "grad_norm": 0.6074355244636536, + "learning_rate": 7.411663460564949e-05, + "loss": 2.22, + "step": 5123 + }, + { + "epoch": 1.5553194718470178, + "grad_norm": 0.5736792087554932, + "learning_rate": 7.411157233977928e-05, + "loss": 1.5849, + "step": 5124 + }, + { + "epoch": 1.5556230080437092, + "grad_norm": 0.5450262427330017, + "learning_rate": 7.410651007390908e-05, + "loss": 1.3948, + "step": 5125 + }, + { + "epoch": 1.5559265442404007, + "grad_norm": 0.3917832672595978, + "learning_rate": 7.410144780803889e-05, + "loss": 1.2642, + "step": 5126 + }, + { + "epoch": 1.556230080437092, + "grad_norm": 0.6708034873008728, + "learning_rate": 7.409638554216868e-05, + "loss": 1.4424, + "step": 5127 + }, + { + "epoch": 1.5565336166337835, + "grad_norm": 0.442088782787323, + "learning_rate": 7.409132327629848e-05, + "loss": 1.663, + "step": 5128 + }, + { + "epoch": 1.5568371528304752, + "grad_norm": 0.5082998871803284, + "learning_rate": 7.408626101042827e-05, + "loss": 1.5504, + "step": 5129 + }, + { + "epoch": 1.5571406890271664, + "grad_norm": 0.5985785722732544, + "learning_rate": 7.408119874455807e-05, + "loss": 1.5082, + "step": 5130 + }, + { + "epoch": 1.557444225223858, + "grad_norm": 0.5707488656044006, + "learning_rate": 7.407613647868786e-05, + "loss": 1.7492, + "step": 5131 + }, + { + "epoch": 1.5577477614205493, + "grad_norm": 0.5827080011367798, + "learning_rate": 7.407107421281766e-05, + "loss": 1.5818, + "step": 5132 + }, + { + "epoch": 1.558051297617241, + "grad_norm": 0.5113789439201355, + "learning_rate": 7.406601194694745e-05, + "loss": 1.7846, + "step": 5133 + }, + { + "epoch": 1.5583548338139324, + "grad_norm": 1.9733306169509888, + "learning_rate": 7.406094968107725e-05, + "loss": 1.667, + "step": 5134 + }, + { + "epoch": 1.5586583700106238, + "grad_norm": 0.5422725081443787, + "learning_rate": 7.405588741520705e-05, + "loss": 1.5454, + "step": 5135 + }, + { + "epoch": 1.5589619062073152, + "grad_norm": 0.5780110955238342, + "learning_rate": 7.405082514933685e-05, + "loss": 1.8285, + "step": 5136 + }, + { + "epoch": 1.5592654424040067, + "grad_norm": 0.5252494215965271, + "learning_rate": 7.404576288346664e-05, + "loss": 2.0586, + "step": 5137 + }, + { + "epoch": 1.559568978600698, + "grad_norm": 0.560908854007721, + "learning_rate": 7.404070061759644e-05, + "loss": 1.8195, + "step": 5138 + }, + { + "epoch": 1.5598725147973895, + "grad_norm": 0.46185895800590515, + "learning_rate": 7.403563835172623e-05, + "loss": 1.244, + "step": 5139 + }, + { + "epoch": 1.5601760509940812, + "grad_norm": 0.5307225584983826, + "learning_rate": 7.403057608585603e-05, + "loss": 1.9262, + "step": 5140 + }, + { + "epoch": 1.5604795871907724, + "grad_norm": 0.49698513746261597, + "learning_rate": 7.402551381998582e-05, + "loss": 1.7835, + "step": 5141 + }, + { + "epoch": 1.560783123387464, + "grad_norm": 0.5683524012565613, + "learning_rate": 7.402045155411562e-05, + "loss": 1.7943, + "step": 5142 + }, + { + "epoch": 1.5610866595841553, + "grad_norm": 1.0103782415390015, + "learning_rate": 7.401538928824541e-05, + "loss": 1.7038, + "step": 5143 + }, + { + "epoch": 1.561390195780847, + "grad_norm": 0.5294140577316284, + "learning_rate": 7.401032702237521e-05, + "loss": 1.5177, + "step": 5144 + }, + { + "epoch": 1.5616937319775384, + "grad_norm": 0.5453392267227173, + "learning_rate": 7.400526475650502e-05, + "loss": 1.6897, + "step": 5145 + }, + { + "epoch": 1.5619972681742298, + "grad_norm": 0.5333344340324402, + "learning_rate": 7.400020249063481e-05, + "loss": 1.8362, + "step": 5146 + }, + { + "epoch": 1.5623008043709212, + "grad_norm": 0.5554139614105225, + "learning_rate": 7.39951402247646e-05, + "loss": 1.7446, + "step": 5147 + }, + { + "epoch": 1.5626043405676127, + "grad_norm": 0.5534631013870239, + "learning_rate": 7.39900779588944e-05, + "loss": 1.8738, + "step": 5148 + }, + { + "epoch": 1.562907876764304, + "grad_norm": 0.5668753981590271, + "learning_rate": 7.398501569302421e-05, + "loss": 1.7437, + "step": 5149 + }, + { + "epoch": 1.5632114129609955, + "grad_norm": 0.5681511163711548, + "learning_rate": 7.3979953427154e-05, + "loss": 1.6496, + "step": 5150 + }, + { + "epoch": 1.5635149491576872, + "grad_norm": 0.6148278713226318, + "learning_rate": 7.39748911612838e-05, + "loss": 1.6409, + "step": 5151 + }, + { + "epoch": 1.5638184853543784, + "grad_norm": 0.529887318611145, + "learning_rate": 7.39698288954136e-05, + "loss": 1.7175, + "step": 5152 + }, + { + "epoch": 1.56412202155107, + "grad_norm": 0.541483461856842, + "learning_rate": 7.396476662954339e-05, + "loss": 2.0849, + "step": 5153 + }, + { + "epoch": 1.5644255577477613, + "grad_norm": 0.4464404582977295, + "learning_rate": 7.395970436367318e-05, + "loss": 1.376, + "step": 5154 + }, + { + "epoch": 1.564729093944453, + "grad_norm": 0.8220679759979248, + "learning_rate": 7.395464209780298e-05, + "loss": 1.1533, + "step": 5155 + }, + { + "epoch": 1.5650326301411444, + "grad_norm": 0.5027211308479309, + "learning_rate": 7.394957983193279e-05, + "loss": 1.648, + "step": 5156 + }, + { + "epoch": 1.5653361663378358, + "grad_norm": 0.5189235210418701, + "learning_rate": 7.394451756606258e-05, + "loss": 1.4195, + "step": 5157 + }, + { + "epoch": 1.5656397025345272, + "grad_norm": 0.5146520137786865, + "learning_rate": 7.393945530019238e-05, + "loss": 1.2644, + "step": 5158 + }, + { + "epoch": 1.5659432387312187, + "grad_norm": 0.6035975217819214, + "learning_rate": 7.393439303432217e-05, + "loss": 1.6141, + "step": 5159 + }, + { + "epoch": 1.5662467749279103, + "grad_norm": 0.5004559755325317, + "learning_rate": 7.392933076845197e-05, + "loss": 1.7301, + "step": 5160 + }, + { + "epoch": 1.5665503111246015, + "grad_norm": 0.5110512375831604, + "learning_rate": 7.392426850258176e-05, + "loss": 1.7182, + "step": 5161 + }, + { + "epoch": 1.5668538473212932, + "grad_norm": 0.6292902827262878, + "learning_rate": 7.391920623671155e-05, + "loss": 1.3006, + "step": 5162 + }, + { + "epoch": 1.5671573835179844, + "grad_norm": 0.6202772259712219, + "learning_rate": 7.391414397084135e-05, + "loss": 1.8302, + "step": 5163 + }, + { + "epoch": 1.567460919714676, + "grad_norm": 0.5574930310249329, + "learning_rate": 7.390908170497114e-05, + "loss": 1.587, + "step": 5164 + }, + { + "epoch": 1.5677644559113675, + "grad_norm": 0.6311307549476624, + "learning_rate": 7.390401943910095e-05, + "loss": 1.8448, + "step": 5165 + }, + { + "epoch": 1.568067992108059, + "grad_norm": 0.5117753148078918, + "learning_rate": 7.389895717323075e-05, + "loss": 1.7946, + "step": 5166 + }, + { + "epoch": 1.5683715283047504, + "grad_norm": 1.1032054424285889, + "learning_rate": 7.389389490736054e-05, + "loss": 1.3926, + "step": 5167 + }, + { + "epoch": 1.5686750645014418, + "grad_norm": 0.5589320659637451, + "learning_rate": 7.388883264149034e-05, + "loss": 1.8236, + "step": 5168 + }, + { + "epoch": 1.5689786006981332, + "grad_norm": 0.5848040580749512, + "learning_rate": 7.388377037562013e-05, + "loss": 1.6766, + "step": 5169 + }, + { + "epoch": 1.5692821368948247, + "grad_norm": 0.45275813341140747, + "learning_rate": 7.387870810974993e-05, + "loss": 1.7592, + "step": 5170 + }, + { + "epoch": 1.5695856730915163, + "grad_norm": 0.5285073518753052, + "learning_rate": 7.387364584387972e-05, + "loss": 1.7066, + "step": 5171 + }, + { + "epoch": 1.5698892092882075, + "grad_norm": 0.46561524271965027, + "learning_rate": 7.386858357800952e-05, + "loss": 1.7283, + "step": 5172 + }, + { + "epoch": 1.5701927454848992, + "grad_norm": 0.570890486240387, + "learning_rate": 7.386352131213931e-05, + "loss": 1.7499, + "step": 5173 + }, + { + "epoch": 1.5704962816815904, + "grad_norm": 0.6721476316452026, + "learning_rate": 7.385845904626912e-05, + "loss": 1.4815, + "step": 5174 + }, + { + "epoch": 1.570799817878282, + "grad_norm": 0.4744941294193268, + "learning_rate": 7.385339678039891e-05, + "loss": 1.7145, + "step": 5175 + }, + { + "epoch": 1.5711033540749735, + "grad_norm": 0.560144305229187, + "learning_rate": 7.384833451452871e-05, + "loss": 1.9931, + "step": 5176 + }, + { + "epoch": 1.571406890271665, + "grad_norm": 0.5556525588035583, + "learning_rate": 7.38432722486585e-05, + "loss": 1.7716, + "step": 5177 + }, + { + "epoch": 1.5717104264683563, + "grad_norm": 0.5719964504241943, + "learning_rate": 7.38382099827883e-05, + "loss": 2.0032, + "step": 5178 + }, + { + "epoch": 1.5720139626650478, + "grad_norm": 0.4774825870990753, + "learning_rate": 7.38331477169181e-05, + "loss": 1.4463, + "step": 5179 + }, + { + "epoch": 1.5723174988617392, + "grad_norm": 0.5442168712615967, + "learning_rate": 7.382808545104789e-05, + "loss": 1.6563, + "step": 5180 + }, + { + "epoch": 1.5726210350584306, + "grad_norm": 0.5183798670768738, + "learning_rate": 7.382302318517768e-05, + "loss": 1.2414, + "step": 5181 + }, + { + "epoch": 1.5729245712551223, + "grad_norm": 0.574775755405426, + "learning_rate": 7.381796091930748e-05, + "loss": 1.7268, + "step": 5182 + }, + { + "epoch": 1.5732281074518135, + "grad_norm": 0.47192198038101196, + "learning_rate": 7.381289865343727e-05, + "loss": 1.2211, + "step": 5183 + }, + { + "epoch": 1.5735316436485052, + "grad_norm": 0.5779873132705688, + "learning_rate": 7.380783638756708e-05, + "loss": 1.5045, + "step": 5184 + }, + { + "epoch": 1.5738351798451964, + "grad_norm": 0.5421679019927979, + "learning_rate": 7.380277412169688e-05, + "loss": 1.6357, + "step": 5185 + }, + { + "epoch": 1.574138716041888, + "grad_norm": 0.8882037401199341, + "learning_rate": 7.379771185582667e-05, + "loss": 1.7545, + "step": 5186 + }, + { + "epoch": 1.5744422522385795, + "grad_norm": 0.5748177170753479, + "learning_rate": 7.379264958995647e-05, + "loss": 1.5829, + "step": 5187 + }, + { + "epoch": 1.574745788435271, + "grad_norm": 0.5343853831291199, + "learning_rate": 7.378758732408626e-05, + "loss": 1.7749, + "step": 5188 + }, + { + "epoch": 1.5750493246319623, + "grad_norm": 0.4772493243217468, + "learning_rate": 7.378252505821605e-05, + "loss": 1.6533, + "step": 5189 + }, + { + "epoch": 1.5753528608286538, + "grad_norm": 0.5794268250465393, + "learning_rate": 7.377746279234585e-05, + "loss": 1.6251, + "step": 5190 + }, + { + "epoch": 1.5756563970253454, + "grad_norm": 0.6607987880706787, + "learning_rate": 7.377240052647564e-05, + "loss": 1.3856, + "step": 5191 + }, + { + "epoch": 1.5759599332220366, + "grad_norm": 0.5267585515975952, + "learning_rate": 7.376733826060544e-05, + "loss": 1.7208, + "step": 5192 + }, + { + "epoch": 1.5762634694187283, + "grad_norm": 0.7312984466552734, + "learning_rate": 7.376227599473525e-05, + "loss": 1.8771, + "step": 5193 + }, + { + "epoch": 1.5765670056154195, + "grad_norm": 0.533445417881012, + "learning_rate": 7.375721372886504e-05, + "loss": 1.7044, + "step": 5194 + }, + { + "epoch": 1.5768705418121112, + "grad_norm": 0.5625982880592346, + "learning_rate": 7.375215146299485e-05, + "loss": 1.7064, + "step": 5195 + }, + { + "epoch": 1.5771740780088026, + "grad_norm": 0.5268564820289612, + "learning_rate": 7.374708919712465e-05, + "loss": 1.7857, + "step": 5196 + }, + { + "epoch": 1.577477614205494, + "grad_norm": 0.6782357096672058, + "learning_rate": 7.374202693125444e-05, + "loss": 1.7049, + "step": 5197 + }, + { + "epoch": 1.5777811504021855, + "grad_norm": 0.5990774035453796, + "learning_rate": 7.373696466538424e-05, + "loss": 1.806, + "step": 5198 + }, + { + "epoch": 1.578084686598877, + "grad_norm": 0.6708577275276184, + "learning_rate": 7.373190239951403e-05, + "loss": 1.4618, + "step": 5199 + }, + { + "epoch": 1.5783882227955683, + "grad_norm": 0.547925591468811, + "learning_rate": 7.372684013364382e-05, + "loss": 1.6713, + "step": 5200 + }, + { + "epoch": 1.5786917589922598, + "grad_norm": 0.5250493288040161, + "learning_rate": 7.372177786777362e-05, + "loss": 1.897, + "step": 5201 + }, + { + "epoch": 1.5789952951889514, + "grad_norm": 0.5222801566123962, + "learning_rate": 7.371671560190341e-05, + "loss": 1.8989, + "step": 5202 + }, + { + "epoch": 1.5792988313856426, + "grad_norm": 0.581599771976471, + "learning_rate": 7.371165333603321e-05, + "loss": 1.3431, + "step": 5203 + }, + { + "epoch": 1.5796023675823343, + "grad_norm": 0.5837922692298889, + "learning_rate": 7.370659107016302e-05, + "loss": 1.2763, + "step": 5204 + }, + { + "epoch": 1.5799059037790255, + "grad_norm": 0.5853214859962463, + "learning_rate": 7.370152880429281e-05, + "loss": 1.5066, + "step": 5205 + }, + { + "epoch": 1.5802094399757172, + "grad_norm": 0.5608053803443909, + "learning_rate": 7.369646653842261e-05, + "loss": 1.8226, + "step": 5206 + }, + { + "epoch": 1.5805129761724086, + "grad_norm": 0.5073212385177612, + "learning_rate": 7.36914042725524e-05, + "loss": 1.1189, + "step": 5207 + }, + { + "epoch": 1.5808165123691, + "grad_norm": 0.5363342761993408, + "learning_rate": 7.36863420066822e-05, + "loss": 1.5296, + "step": 5208 + }, + { + "epoch": 1.5811200485657915, + "grad_norm": 0.4550332725048065, + "learning_rate": 7.368127974081199e-05, + "loss": 1.8264, + "step": 5209 + }, + { + "epoch": 1.581423584762483, + "grad_norm": 0.5468671917915344, + "learning_rate": 7.367621747494179e-05, + "loss": 1.7837, + "step": 5210 + }, + { + "epoch": 1.5817271209591743, + "grad_norm": 0.5812280774116516, + "learning_rate": 7.367115520907158e-05, + "loss": 1.6072, + "step": 5211 + }, + { + "epoch": 1.5820306571558658, + "grad_norm": 0.6074302196502686, + "learning_rate": 7.366609294320138e-05, + "loss": 1.3217, + "step": 5212 + }, + { + "epoch": 1.5823341933525574, + "grad_norm": 0.8310289978981018, + "learning_rate": 7.366103067733118e-05, + "loss": 1.6822, + "step": 5213 + }, + { + "epoch": 1.5826377295492486, + "grad_norm": 0.5651637315750122, + "learning_rate": 7.365596841146098e-05, + "loss": 1.9405, + "step": 5214 + }, + { + "epoch": 1.5829412657459403, + "grad_norm": 0.5646255612373352, + "learning_rate": 7.365090614559077e-05, + "loss": 1.7228, + "step": 5215 + }, + { + "epoch": 1.5832448019426315, + "grad_norm": 0.621033787727356, + "learning_rate": 7.364584387972057e-05, + "loss": 1.2852, + "step": 5216 + }, + { + "epoch": 1.5835483381393232, + "grad_norm": 0.5037727952003479, + "learning_rate": 7.364078161385036e-05, + "loss": 1.3287, + "step": 5217 + }, + { + "epoch": 1.5838518743360146, + "grad_norm": 0.5539587736129761, + "learning_rate": 7.363571934798016e-05, + "loss": 1.2148, + "step": 5218 + }, + { + "epoch": 1.584155410532706, + "grad_norm": 0.48859071731567383, + "learning_rate": 7.363065708210995e-05, + "loss": 1.8287, + "step": 5219 + }, + { + "epoch": 1.5844589467293975, + "grad_norm": 0.964847981929779, + "learning_rate": 7.362559481623975e-05, + "loss": 1.7749, + "step": 5220 + }, + { + "epoch": 1.584762482926089, + "grad_norm": 0.5616121292114258, + "learning_rate": 7.362053255036954e-05, + "loss": 1.6908, + "step": 5221 + }, + { + "epoch": 1.5850660191227806, + "grad_norm": 0.5466740727424622, + "learning_rate": 7.361547028449934e-05, + "loss": 1.4226, + "step": 5222 + }, + { + "epoch": 1.5853695553194718, + "grad_norm": 0.4894787073135376, + "learning_rate": 7.361040801862915e-05, + "loss": 1.3254, + "step": 5223 + }, + { + "epoch": 1.5856730915161634, + "grad_norm": 0.4261757433414459, + "learning_rate": 7.360534575275894e-05, + "loss": 0.9082, + "step": 5224 + }, + { + "epoch": 1.5859766277128546, + "grad_norm": 0.5084283947944641, + "learning_rate": 7.360028348688874e-05, + "loss": 1.2136, + "step": 5225 + }, + { + "epoch": 1.5862801639095463, + "grad_norm": 0.41630545258522034, + "learning_rate": 7.359522122101853e-05, + "loss": 1.8932, + "step": 5226 + }, + { + "epoch": 1.5865837001062375, + "grad_norm": 0.5659027695655823, + "learning_rate": 7.359015895514832e-05, + "loss": 1.36, + "step": 5227 + }, + { + "epoch": 1.5868872363029292, + "grad_norm": 0.5066294074058533, + "learning_rate": 7.358509668927812e-05, + "loss": 1.6067, + "step": 5228 + }, + { + "epoch": 1.5871907724996206, + "grad_norm": 0.44448360800743103, + "learning_rate": 7.358003442340791e-05, + "loss": 1.1769, + "step": 5229 + }, + { + "epoch": 1.587494308696312, + "grad_norm": 0.5426762104034424, + "learning_rate": 7.357497215753771e-05, + "loss": 1.6513, + "step": 5230 + }, + { + "epoch": 1.5877978448930035, + "grad_norm": 0.5724853277206421, + "learning_rate": 7.35699098916675e-05, + "loss": 1.6179, + "step": 5231 + }, + { + "epoch": 1.588101381089695, + "grad_norm": 0.5723243951797485, + "learning_rate": 7.356484762579731e-05, + "loss": 2.1125, + "step": 5232 + }, + { + "epoch": 1.5884049172863866, + "grad_norm": 0.6052478551864624, + "learning_rate": 7.355978535992711e-05, + "loss": 1.6911, + "step": 5233 + }, + { + "epoch": 1.5887084534830778, + "grad_norm": 0.5020635724067688, + "learning_rate": 7.35547230940569e-05, + "loss": 1.8538, + "step": 5234 + }, + { + "epoch": 1.5890119896797694, + "grad_norm": 0.5227568745613098, + "learning_rate": 7.35496608281867e-05, + "loss": 1.906, + "step": 5235 + }, + { + "epoch": 1.5893155258764606, + "grad_norm": 0.549309492111206, + "learning_rate": 7.354459856231649e-05, + "loss": 1.363, + "step": 5236 + }, + { + "epoch": 1.5896190620731523, + "grad_norm": 0.4708143472671509, + "learning_rate": 7.353953629644629e-05, + "loss": 2.0159, + "step": 5237 + }, + { + "epoch": 1.5899225982698437, + "grad_norm": 0.5588541030883789, + "learning_rate": 7.35344740305761e-05, + "loss": 1.6558, + "step": 5238 + }, + { + "epoch": 1.5902261344665352, + "grad_norm": 0.4781373143196106, + "learning_rate": 7.352941176470589e-05, + "loss": 1.7517, + "step": 5239 + }, + { + "epoch": 1.5905296706632266, + "grad_norm": 0.5474836230278015, + "learning_rate": 7.352434949883568e-05, + "loss": 1.4177, + "step": 5240 + }, + { + "epoch": 1.590833206859918, + "grad_norm": 0.6164267659187317, + "learning_rate": 7.351928723296548e-05, + "loss": 1.5234, + "step": 5241 + }, + { + "epoch": 1.5911367430566095, + "grad_norm": 0.5702620148658752, + "learning_rate": 7.351422496709527e-05, + "loss": 1.5461, + "step": 5242 + }, + { + "epoch": 1.591440279253301, + "grad_norm": 0.609255313873291, + "learning_rate": 7.350916270122508e-05, + "loss": 1.9137, + "step": 5243 + }, + { + "epoch": 1.5917438154499925, + "grad_norm": 0.5648717880249023, + "learning_rate": 7.350410043535488e-05, + "loss": 1.7468, + "step": 5244 + }, + { + "epoch": 1.5920473516466838, + "grad_norm": 0.5698893070220947, + "learning_rate": 7.349903816948467e-05, + "loss": 1.4076, + "step": 5245 + }, + { + "epoch": 1.5923508878433754, + "grad_norm": 0.5639516711235046, + "learning_rate": 7.349397590361447e-05, + "loss": 1.9586, + "step": 5246 + }, + { + "epoch": 1.5926544240400666, + "grad_norm": 0.6253385543823242, + "learning_rate": 7.348891363774426e-05, + "loss": 1.5954, + "step": 5247 + }, + { + "epoch": 1.5929579602367583, + "grad_norm": 0.47530102729797363, + "learning_rate": 7.348385137187406e-05, + "loss": 1.371, + "step": 5248 + }, + { + "epoch": 1.5932614964334497, + "grad_norm": 0.7411981225013733, + "learning_rate": 7.347878910600385e-05, + "loss": 1.9518, + "step": 5249 + }, + { + "epoch": 1.5935650326301412, + "grad_norm": 1.0231763124465942, + "learning_rate": 7.347372684013365e-05, + "loss": 1.1718, + "step": 5250 + }, + { + "epoch": 1.5938685688268326, + "grad_norm": 0.6356399059295654, + "learning_rate": 7.346866457426344e-05, + "loss": 1.8131, + "step": 5251 + }, + { + "epoch": 1.594172105023524, + "grad_norm": 0.577565610408783, + "learning_rate": 7.346360230839325e-05, + "loss": 1.7185, + "step": 5252 + }, + { + "epoch": 1.5944756412202155, + "grad_norm": 0.6096293330192566, + "learning_rate": 7.345854004252304e-05, + "loss": 1.534, + "step": 5253 + }, + { + "epoch": 1.594779177416907, + "grad_norm": 0.5408341288566589, + "learning_rate": 7.345347777665284e-05, + "loss": 1.7644, + "step": 5254 + }, + { + "epoch": 1.5950827136135985, + "grad_norm": 0.5350682139396667, + "learning_rate": 7.344841551078263e-05, + "loss": 1.723, + "step": 5255 + }, + { + "epoch": 1.5953862498102898, + "grad_norm": 0.8048761487007141, + "learning_rate": 7.344335324491243e-05, + "loss": 1.5371, + "step": 5256 + }, + { + "epoch": 1.5956897860069814, + "grad_norm": 0.427385538816452, + "learning_rate": 7.343829097904222e-05, + "loss": 1.7227, + "step": 5257 + }, + { + "epoch": 1.5959933222036726, + "grad_norm": 0.5769528150558472, + "learning_rate": 7.343322871317202e-05, + "loss": 1.8178, + "step": 5258 + }, + { + "epoch": 1.5962968584003643, + "grad_norm": 0.6781782507896423, + "learning_rate": 7.342816644730181e-05, + "loss": 1.7434, + "step": 5259 + }, + { + "epoch": 1.5966003945970557, + "grad_norm": 0.5181839466094971, + "learning_rate": 7.342310418143161e-05, + "loss": 1.6126, + "step": 5260 + }, + { + "epoch": 1.5969039307937472, + "grad_norm": 0.5487415790557861, + "learning_rate": 7.34180419155614e-05, + "loss": 1.6194, + "step": 5261 + }, + { + "epoch": 1.5972074669904386, + "grad_norm": 0.5643243789672852, + "learning_rate": 7.341297964969121e-05, + "loss": 1.5133, + "step": 5262 + }, + { + "epoch": 1.59751100318713, + "grad_norm": 0.6052178144454956, + "learning_rate": 7.3407917383821e-05, + "loss": 1.7253, + "step": 5263 + }, + { + "epoch": 1.5978145393838217, + "grad_norm": 0.5676378607749939, + "learning_rate": 7.34028551179508e-05, + "loss": 1.7509, + "step": 5264 + }, + { + "epoch": 1.5981180755805129, + "grad_norm": 0.6112813949584961, + "learning_rate": 7.33977928520806e-05, + "loss": 1.7657, + "step": 5265 + }, + { + "epoch": 1.5984216117772045, + "grad_norm": 0.4967426657676697, + "learning_rate": 7.339273058621039e-05, + "loss": 1.7637, + "step": 5266 + }, + { + "epoch": 1.5987251479738958, + "grad_norm": 0.5924389958381653, + "learning_rate": 7.338766832034018e-05, + "loss": 1.7697, + "step": 5267 + }, + { + "epoch": 1.5990286841705874, + "grad_norm": 0.547675371170044, + "learning_rate": 7.338260605446998e-05, + "loss": 1.5096, + "step": 5268 + }, + { + "epoch": 1.5993322203672788, + "grad_norm": 0.5636522769927979, + "learning_rate": 7.337754378859977e-05, + "loss": 1.5292, + "step": 5269 + }, + { + "epoch": 1.5996357565639703, + "grad_norm": 0.5873835682868958, + "learning_rate": 7.337248152272957e-05, + "loss": 1.5638, + "step": 5270 + }, + { + "epoch": 1.5999392927606617, + "grad_norm": 0.548093855381012, + "learning_rate": 7.336741925685938e-05, + "loss": 0.982, + "step": 5271 + }, + { + "epoch": 1.6002428289573531, + "grad_norm": 0.4551286995410919, + "learning_rate": 7.336235699098917e-05, + "loss": 0.8414, + "step": 5272 + }, + { + "epoch": 1.6005463651540446, + "grad_norm": 0.4967573881149292, + "learning_rate": 7.335729472511897e-05, + "loss": 1.8692, + "step": 5273 + }, + { + "epoch": 1.600849901350736, + "grad_norm": 0.5112420916557312, + "learning_rate": 7.335223245924876e-05, + "loss": 1.7884, + "step": 5274 + }, + { + "epoch": 1.6011534375474277, + "grad_norm": 0.6654168367385864, + "learning_rate": 7.334717019337856e-05, + "loss": 1.8305, + "step": 5275 + }, + { + "epoch": 1.6014569737441189, + "grad_norm": 0.5480862855911255, + "learning_rate": 7.334210792750835e-05, + "loss": 1.5863, + "step": 5276 + }, + { + "epoch": 1.6017605099408105, + "grad_norm": 0.5990899205207825, + "learning_rate": 7.333704566163815e-05, + "loss": 1.2868, + "step": 5277 + }, + { + "epoch": 1.6020640461375018, + "grad_norm": 0.5188322067260742, + "learning_rate": 7.333198339576794e-05, + "loss": 1.8025, + "step": 5278 + }, + { + "epoch": 1.6023675823341934, + "grad_norm": 0.5856629014015198, + "learning_rate": 7.332692112989774e-05, + "loss": 1.3265, + "step": 5279 + }, + { + "epoch": 1.6026711185308848, + "grad_norm": 0.545465350151062, + "learning_rate": 7.332185886402754e-05, + "loss": 1.862, + "step": 5280 + }, + { + "epoch": 1.6029746547275763, + "grad_norm": 0.4535203278064728, + "learning_rate": 7.331679659815734e-05, + "loss": 1.9352, + "step": 5281 + }, + { + "epoch": 1.6032781909242677, + "grad_norm": 0.6021257638931274, + "learning_rate": 7.331173433228713e-05, + "loss": 1.7702, + "step": 5282 + }, + { + "epoch": 1.6035817271209591, + "grad_norm": 0.6483513712882996, + "learning_rate": 7.330667206641694e-05, + "loss": 1.3199, + "step": 5283 + }, + { + "epoch": 1.6038852633176506, + "grad_norm": 0.46683990955352783, + "learning_rate": 7.330160980054674e-05, + "loss": 1.8755, + "step": 5284 + }, + { + "epoch": 1.604188799514342, + "grad_norm": 0.55056232213974, + "learning_rate": 7.329654753467653e-05, + "loss": 1.1878, + "step": 5285 + }, + { + "epoch": 1.6044923357110337, + "grad_norm": 0.572959303855896, + "learning_rate": 7.329148526880633e-05, + "loss": 1.3519, + "step": 5286 + }, + { + "epoch": 1.6047958719077249, + "grad_norm": 0.6803028583526611, + "learning_rate": 7.328642300293612e-05, + "loss": 1.3468, + "step": 5287 + }, + { + "epoch": 1.6050994081044165, + "grad_norm": 0.6406558156013489, + "learning_rate": 7.328136073706592e-05, + "loss": 1.3617, + "step": 5288 + }, + { + "epoch": 1.6054029443011077, + "grad_norm": 0.8463157415390015, + "learning_rate": 7.327629847119571e-05, + "loss": 1.4584, + "step": 5289 + }, + { + "epoch": 1.6057064804977994, + "grad_norm": 0.6122679114341736, + "learning_rate": 7.32712362053255e-05, + "loss": 1.5421, + "step": 5290 + }, + { + "epoch": 1.6060100166944908, + "grad_norm": 0.4659373164176941, + "learning_rate": 7.326617393945531e-05, + "loss": 1.6206, + "step": 5291 + }, + { + "epoch": 1.6063135528911823, + "grad_norm": 0.5055060386657715, + "learning_rate": 7.326111167358511e-05, + "loss": 2.0784, + "step": 5292 + }, + { + "epoch": 1.6066170890878737, + "grad_norm": 0.5983397960662842, + "learning_rate": 7.32560494077149e-05, + "loss": 1.7021, + "step": 5293 + }, + { + "epoch": 1.6069206252845651, + "grad_norm": 0.5488964915275574, + "learning_rate": 7.32509871418447e-05, + "loss": 1.8662, + "step": 5294 + }, + { + "epoch": 1.6072241614812568, + "grad_norm": 0.48650479316711426, + "learning_rate": 7.324592487597449e-05, + "loss": 1.6154, + "step": 5295 + }, + { + "epoch": 1.607527697677948, + "grad_norm": 0.5552559494972229, + "learning_rate": 7.324086261010429e-05, + "loss": 1.1789, + "step": 5296 + }, + { + "epoch": 1.6078312338746397, + "grad_norm": 0.595207691192627, + "learning_rate": 7.323580034423408e-05, + "loss": 1.2947, + "step": 5297 + }, + { + "epoch": 1.6081347700713309, + "grad_norm": 0.5485917925834656, + "learning_rate": 7.323073807836388e-05, + "loss": 1.7643, + "step": 5298 + }, + { + "epoch": 1.6084383062680225, + "grad_norm": 0.5586134791374207, + "learning_rate": 7.322567581249367e-05, + "loss": 1.7608, + "step": 5299 + }, + { + "epoch": 1.608741842464714, + "grad_norm": 0.5535512566566467, + "learning_rate": 7.322061354662347e-05, + "loss": 1.7434, + "step": 5300 + }, + { + "epoch": 1.6090453786614054, + "grad_norm": 0.540846586227417, + "learning_rate": 7.321555128075328e-05, + "loss": 1.7094, + "step": 5301 + }, + { + "epoch": 1.6093489148580968, + "grad_norm": 0.5523681044578552, + "learning_rate": 7.321048901488307e-05, + "loss": 1.6085, + "step": 5302 + }, + { + "epoch": 1.6096524510547883, + "grad_norm": 0.4929633140563965, + "learning_rate": 7.320542674901286e-05, + "loss": 1.148, + "step": 5303 + }, + { + "epoch": 1.6099559872514797, + "grad_norm": 0.8295103311538696, + "learning_rate": 7.320036448314266e-05, + "loss": 1.7371, + "step": 5304 + }, + { + "epoch": 1.6102595234481711, + "grad_norm": 0.5253877639770508, + "learning_rate": 7.319530221727245e-05, + "loss": 1.7138, + "step": 5305 + }, + { + "epoch": 1.6105630596448628, + "grad_norm": 0.5233611464500427, + "learning_rate": 7.319023995140225e-05, + "loss": 1.8088, + "step": 5306 + }, + { + "epoch": 1.610866595841554, + "grad_norm": 1.7711957693099976, + "learning_rate": 7.318517768553204e-05, + "loss": 1.4473, + "step": 5307 + }, + { + "epoch": 1.6111701320382457, + "grad_norm": 0.6009371876716614, + "learning_rate": 7.318011541966184e-05, + "loss": 1.8576, + "step": 5308 + }, + { + "epoch": 1.6114736682349369, + "grad_norm": 0.4875546395778656, + "learning_rate": 7.317505315379163e-05, + "loss": 1.5732, + "step": 5309 + }, + { + "epoch": 1.6117772044316285, + "grad_norm": 0.5272681713104248, + "learning_rate": 7.316999088792144e-05, + "loss": 1.8052, + "step": 5310 + }, + { + "epoch": 1.61208074062832, + "grad_norm": 0.5559819340705872, + "learning_rate": 7.316492862205124e-05, + "loss": 1.3846, + "step": 5311 + }, + { + "epoch": 1.6123842768250114, + "grad_norm": 0.5686014890670776, + "learning_rate": 7.315986635618103e-05, + "loss": 1.8608, + "step": 5312 + }, + { + "epoch": 1.6126878130217028, + "grad_norm": 0.533433198928833, + "learning_rate": 7.315480409031083e-05, + "loss": 1.516, + "step": 5313 + }, + { + "epoch": 1.6129913492183943, + "grad_norm": 0.5107302665710449, + "learning_rate": 7.314974182444062e-05, + "loss": 1.9804, + "step": 5314 + }, + { + "epoch": 1.6132948854150857, + "grad_norm": 0.48063787817955017, + "learning_rate": 7.314467955857042e-05, + "loss": 1.5903, + "step": 5315 + }, + { + "epoch": 1.6135984216117771, + "grad_norm": 0.5146118998527527, + "learning_rate": 7.313961729270021e-05, + "loss": 1.5592, + "step": 5316 + }, + { + "epoch": 1.6139019578084688, + "grad_norm": 0.5074766874313354, + "learning_rate": 7.313455502683e-05, + "loss": 1.7058, + "step": 5317 + }, + { + "epoch": 1.61420549400516, + "grad_norm": 0.49822354316711426, + "learning_rate": 7.31294927609598e-05, + "loss": 1.7331, + "step": 5318 + }, + { + "epoch": 1.6145090302018517, + "grad_norm": 0.5845142006874084, + "learning_rate": 7.312443049508961e-05, + "loss": 1.9431, + "step": 5319 + }, + { + "epoch": 1.6148125663985429, + "grad_norm": 0.4772799611091614, + "learning_rate": 7.31193682292194e-05, + "loss": 2.0023, + "step": 5320 + }, + { + "epoch": 1.6151161025952345, + "grad_norm": 0.6034853458404541, + "learning_rate": 7.31143059633492e-05, + "loss": 1.7094, + "step": 5321 + }, + { + "epoch": 1.615419638791926, + "grad_norm": 0.5286986231803894, + "learning_rate": 7.310924369747899e-05, + "loss": 1.7875, + "step": 5322 + }, + { + "epoch": 1.6157231749886174, + "grad_norm": 0.6037927269935608, + "learning_rate": 7.310418143160879e-05, + "loss": 1.7475, + "step": 5323 + }, + { + "epoch": 1.6160267111853088, + "grad_norm": 7.824846267700195, + "learning_rate": 7.309911916573858e-05, + "loss": 1.6713, + "step": 5324 + }, + { + "epoch": 1.6163302473820003, + "grad_norm": 0.6112735271453857, + "learning_rate": 7.309405689986838e-05, + "loss": 1.7414, + "step": 5325 + }, + { + "epoch": 1.616633783578692, + "grad_norm": 0.5998720526695251, + "learning_rate": 7.308899463399817e-05, + "loss": 1.1521, + "step": 5326 + }, + { + "epoch": 1.6169373197753831, + "grad_norm": 0.5878032445907593, + "learning_rate": 7.308393236812798e-05, + "loss": 1.4747, + "step": 5327 + }, + { + "epoch": 1.6172408559720748, + "grad_norm": 0.5835018157958984, + "learning_rate": 7.307887010225778e-05, + "loss": 1.6374, + "step": 5328 + }, + { + "epoch": 1.617544392168766, + "grad_norm": 0.5299321413040161, + "learning_rate": 7.307380783638757e-05, + "loss": 1.8917, + "step": 5329 + }, + { + "epoch": 1.6178479283654577, + "grad_norm": 0.5514640808105469, + "learning_rate": 7.306874557051738e-05, + "loss": 1.6584, + "step": 5330 + }, + { + "epoch": 1.618151464562149, + "grad_norm": 0.44423699378967285, + "learning_rate": 7.306368330464717e-05, + "loss": 1.3953, + "step": 5331 + }, + { + "epoch": 1.6184550007588405, + "grad_norm": 0.45627906918525696, + "learning_rate": 7.305862103877697e-05, + "loss": 1.4091, + "step": 5332 + }, + { + "epoch": 1.618758536955532, + "grad_norm": 0.500055193901062, + "learning_rate": 7.305355877290676e-05, + "loss": 1.7118, + "step": 5333 + }, + { + "epoch": 1.6190620731522234, + "grad_norm": 0.5136251449584961, + "learning_rate": 7.304849650703656e-05, + "loss": 1.6012, + "step": 5334 + }, + { + "epoch": 1.6193656093489148, + "grad_norm": 0.48394328355789185, + "learning_rate": 7.304343424116635e-05, + "loss": 1.7937, + "step": 5335 + }, + { + "epoch": 1.6196691455456063, + "grad_norm": 0.4821557104587555, + "learning_rate": 7.303837197529615e-05, + "loss": 1.6532, + "step": 5336 + }, + { + "epoch": 1.619972681742298, + "grad_norm": 1.043676495552063, + "learning_rate": 7.303330970942594e-05, + "loss": 1.5878, + "step": 5337 + }, + { + "epoch": 1.6202762179389891, + "grad_norm": 0.4502454996109009, + "learning_rate": 7.302824744355574e-05, + "loss": 1.7892, + "step": 5338 + }, + { + "epoch": 1.6205797541356808, + "grad_norm": 0.605509340763092, + "learning_rate": 7.302318517768553e-05, + "loss": 2.0461, + "step": 5339 + }, + { + "epoch": 1.620883290332372, + "grad_norm": 0.5283710956573486, + "learning_rate": 7.301812291181534e-05, + "loss": 1.7673, + "step": 5340 + }, + { + "epoch": 1.6211868265290637, + "grad_norm": 0.5143932700157166, + "learning_rate": 7.301306064594513e-05, + "loss": 1.7829, + "step": 5341 + }, + { + "epoch": 1.621490362725755, + "grad_norm": 0.5089682936668396, + "learning_rate": 7.300799838007493e-05, + "loss": 1.8419, + "step": 5342 + }, + { + "epoch": 1.6217938989224465, + "grad_norm": 0.5280531644821167, + "learning_rate": 7.300293611420472e-05, + "loss": 1.1959, + "step": 5343 + }, + { + "epoch": 1.622097435119138, + "grad_norm": 0.45322325825691223, + "learning_rate": 7.299787384833452e-05, + "loss": 1.3736, + "step": 5344 + }, + { + "epoch": 1.6224009713158294, + "grad_norm": 0.39491701126098633, + "learning_rate": 7.299281158246431e-05, + "loss": 0.8163, + "step": 5345 + }, + { + "epoch": 1.6227045075125208, + "grad_norm": 0.6200827956199646, + "learning_rate": 7.298774931659411e-05, + "loss": 1.6614, + "step": 5346 + }, + { + "epoch": 1.6230080437092123, + "grad_norm": 0.4712357223033905, + "learning_rate": 7.29826870507239e-05, + "loss": 1.7717, + "step": 5347 + }, + { + "epoch": 1.623311579905904, + "grad_norm": 0.5763128995895386, + "learning_rate": 7.29776247848537e-05, + "loss": 1.7881, + "step": 5348 + }, + { + "epoch": 1.6236151161025951, + "grad_norm": 0.6178731322288513, + "learning_rate": 7.29725625189835e-05, + "loss": 1.5837, + "step": 5349 + }, + { + "epoch": 1.6239186522992868, + "grad_norm": 0.5563769936561584, + "learning_rate": 7.29675002531133e-05, + "loss": 1.54, + "step": 5350 + }, + { + "epoch": 1.624222188495978, + "grad_norm": 0.6363424062728882, + "learning_rate": 7.29624379872431e-05, + "loss": 1.4934, + "step": 5351 + }, + { + "epoch": 1.6245257246926696, + "grad_norm": 0.5647356510162354, + "learning_rate": 7.295737572137289e-05, + "loss": 1.3051, + "step": 5352 + }, + { + "epoch": 1.624829260889361, + "grad_norm": 3.164123296737671, + "learning_rate": 7.295231345550269e-05, + "loss": 2.0574, + "step": 5353 + }, + { + "epoch": 1.6251327970860525, + "grad_norm": 0.5882038474082947, + "learning_rate": 7.294725118963248e-05, + "loss": 1.8514, + "step": 5354 + }, + { + "epoch": 1.625436333282744, + "grad_norm": 0.4425477683544159, + "learning_rate": 7.294218892376228e-05, + "loss": 0.847, + "step": 5355 + }, + { + "epoch": 1.6257398694794354, + "grad_norm": 0.5342074036598206, + "learning_rate": 7.293712665789207e-05, + "loss": 1.3687, + "step": 5356 + }, + { + "epoch": 1.626043405676127, + "grad_norm": 0.5602349042892456, + "learning_rate": 7.293206439202186e-05, + "loss": 1.7487, + "step": 5357 + }, + { + "epoch": 1.6263469418728183, + "grad_norm": 0.5369912385940552, + "learning_rate": 7.292700212615167e-05, + "loss": 1.7015, + "step": 5358 + }, + { + "epoch": 1.62665047806951, + "grad_norm": 0.563209593296051, + "learning_rate": 7.292193986028147e-05, + "loss": 1.756, + "step": 5359 + }, + { + "epoch": 1.6269540142662011, + "grad_norm": 0.6118689179420471, + "learning_rate": 7.291687759441126e-05, + "loss": 1.6451, + "step": 5360 + }, + { + "epoch": 1.6272575504628928, + "grad_norm": 0.5531105399131775, + "learning_rate": 7.291181532854106e-05, + "loss": 0.894, + "step": 5361 + }, + { + "epoch": 1.6275610866595842, + "grad_norm": 0.5832155346870422, + "learning_rate": 7.290675306267085e-05, + "loss": 1.3583, + "step": 5362 + }, + { + "epoch": 1.6278646228562756, + "grad_norm": 0.7351096272468567, + "learning_rate": 7.290169079680065e-05, + "loss": 1.6225, + "step": 5363 + }, + { + "epoch": 1.628168159052967, + "grad_norm": 0.5954158902168274, + "learning_rate": 7.289662853093044e-05, + "loss": 1.711, + "step": 5364 + }, + { + "epoch": 1.6284716952496585, + "grad_norm": 0.5454484820365906, + "learning_rate": 7.289156626506024e-05, + "loss": 1.5543, + "step": 5365 + }, + { + "epoch": 1.62877523144635, + "grad_norm": 0.5965268015861511, + "learning_rate": 7.288650399919003e-05, + "loss": 1.706, + "step": 5366 + }, + { + "epoch": 1.6290787676430414, + "grad_norm": 0.6655979752540588, + "learning_rate": 7.288144173331983e-05, + "loss": 1.7892, + "step": 5367 + }, + { + "epoch": 1.629382303839733, + "grad_norm": 0.6440061926841736, + "learning_rate": 7.287637946744963e-05, + "loss": 1.8773, + "step": 5368 + }, + { + "epoch": 1.6296858400364242, + "grad_norm": 0.5195211172103882, + "learning_rate": 7.287131720157943e-05, + "loss": 1.7512, + "step": 5369 + }, + { + "epoch": 1.629989376233116, + "grad_norm": 0.5398062467575073, + "learning_rate": 7.286625493570922e-05, + "loss": 1.1974, + "step": 5370 + }, + { + "epoch": 1.6302929124298071, + "grad_norm": 0.5433503985404968, + "learning_rate": 7.286119266983902e-05, + "loss": 1.8253, + "step": 5371 + }, + { + "epoch": 1.6305964486264988, + "grad_norm": 0.4917634129524231, + "learning_rate": 7.285613040396883e-05, + "loss": 1.8575, + "step": 5372 + }, + { + "epoch": 1.6308999848231902, + "grad_norm": 0.6070329546928406, + "learning_rate": 7.285106813809862e-05, + "loss": 1.9349, + "step": 5373 + }, + { + "epoch": 1.6312035210198816, + "grad_norm": 0.5169711709022522, + "learning_rate": 7.284600587222842e-05, + "loss": 1.8008, + "step": 5374 + }, + { + "epoch": 1.631507057216573, + "grad_norm": 0.6040024757385254, + "learning_rate": 7.284094360635821e-05, + "loss": 1.7703, + "step": 5375 + }, + { + "epoch": 1.6318105934132645, + "grad_norm": 0.5740352272987366, + "learning_rate": 7.2835881340488e-05, + "loss": 1.8159, + "step": 5376 + }, + { + "epoch": 1.632114129609956, + "grad_norm": 0.5424453616142273, + "learning_rate": 7.28308190746178e-05, + "loss": 1.5593, + "step": 5377 + }, + { + "epoch": 1.6324176658066474, + "grad_norm": 0.5914655923843384, + "learning_rate": 7.28257568087476e-05, + "loss": 1.6786, + "step": 5378 + }, + { + "epoch": 1.632721202003339, + "grad_norm": 0.5969242453575134, + "learning_rate": 7.28206945428774e-05, + "loss": 1.7036, + "step": 5379 + }, + { + "epoch": 1.6330247382000302, + "grad_norm": 0.9573061466217041, + "learning_rate": 7.28156322770072e-05, + "loss": 1.2625, + "step": 5380 + }, + { + "epoch": 1.633328274396722, + "grad_norm": 0.4953905940055847, + "learning_rate": 7.2810570011137e-05, + "loss": 1.7095, + "step": 5381 + }, + { + "epoch": 1.6336318105934131, + "grad_norm": 0.48235177993774414, + "learning_rate": 7.280550774526679e-05, + "loss": 1.4157, + "step": 5382 + }, + { + "epoch": 1.6339353467901048, + "grad_norm": 0.5806698799133301, + "learning_rate": 7.280044547939658e-05, + "loss": 1.5457, + "step": 5383 + }, + { + "epoch": 1.6342388829867962, + "grad_norm": 0.48974600434303284, + "learning_rate": 7.279538321352638e-05, + "loss": 1.6838, + "step": 5384 + }, + { + "epoch": 1.6345424191834876, + "grad_norm": 0.5429823398590088, + "learning_rate": 7.279032094765617e-05, + "loss": 1.7311, + "step": 5385 + }, + { + "epoch": 1.634845955380179, + "grad_norm": 0.5275961756706238, + "learning_rate": 7.278525868178597e-05, + "loss": 1.6541, + "step": 5386 + }, + { + "epoch": 1.6351494915768705, + "grad_norm": 0.6194995045661926, + "learning_rate": 7.278019641591576e-05, + "loss": 1.5361, + "step": 5387 + }, + { + "epoch": 1.6354530277735622, + "grad_norm": 0.5362678170204163, + "learning_rate": 7.277513415004557e-05, + "loss": 1.8779, + "step": 5388 + }, + { + "epoch": 1.6357565639702534, + "grad_norm": 0.8009393811225891, + "learning_rate": 7.277007188417537e-05, + "loss": 1.7224, + "step": 5389 + }, + { + "epoch": 1.636060100166945, + "grad_norm": 0.5585671067237854, + "learning_rate": 7.276500961830516e-05, + "loss": 1.8676, + "step": 5390 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 0.5586265921592712, + "learning_rate": 7.275994735243496e-05, + "loss": 1.6578, + "step": 5391 + }, + { + "epoch": 1.636667172560328, + "grad_norm": 0.5737308859825134, + "learning_rate": 7.275488508656475e-05, + "loss": 1.8705, + "step": 5392 + }, + { + "epoch": 1.636970708757019, + "grad_norm": 0.6445639729499817, + "learning_rate": 7.274982282069455e-05, + "loss": 1.65, + "step": 5393 + }, + { + "epoch": 1.6372742449537108, + "grad_norm": 0.5315992832183838, + "learning_rate": 7.274476055482434e-05, + "loss": 1.4896, + "step": 5394 + }, + { + "epoch": 1.6375777811504022, + "grad_norm": 0.5430747270584106, + "learning_rate": 7.273969828895413e-05, + "loss": 1.7276, + "step": 5395 + }, + { + "epoch": 1.6378813173470936, + "grad_norm": 0.4536675214767456, + "learning_rate": 7.273463602308393e-05, + "loss": 1.7787, + "step": 5396 + }, + { + "epoch": 1.638184853543785, + "grad_norm": 0.9887977242469788, + "learning_rate": 7.272957375721374e-05, + "loss": 1.2947, + "step": 5397 + }, + { + "epoch": 1.6384883897404765, + "grad_norm": 0.5427131056785583, + "learning_rate": 7.272451149134353e-05, + "loss": 2.032, + "step": 5398 + }, + { + "epoch": 1.6387919259371682, + "grad_norm": 0.5042452812194824, + "learning_rate": 7.271944922547333e-05, + "loss": 1.7509, + "step": 5399 + }, + { + "epoch": 1.6390954621338594, + "grad_norm": 0.5531298518180847, + "learning_rate": 7.271438695960312e-05, + "loss": 1.4943, + "step": 5400 + }, + { + "epoch": 1.639398998330551, + "grad_norm": 0.514651894569397, + "learning_rate": 7.270932469373292e-05, + "loss": 1.2271, + "step": 5401 + }, + { + "epoch": 1.6397025345272422, + "grad_norm": 0.5557711124420166, + "learning_rate": 7.270426242786271e-05, + "loss": 1.6918, + "step": 5402 + }, + { + "epoch": 1.640006070723934, + "grad_norm": 0.6158317923545837, + "learning_rate": 7.269920016199251e-05, + "loss": 1.7352, + "step": 5403 + }, + { + "epoch": 1.6403096069206253, + "grad_norm": 0.5777170658111572, + "learning_rate": 7.26941378961223e-05, + "loss": 1.8337, + "step": 5404 + }, + { + "epoch": 1.6406131431173168, + "grad_norm": 0.5811499357223511, + "learning_rate": 7.26890756302521e-05, + "loss": 1.668, + "step": 5405 + }, + { + "epoch": 1.6409166793140082, + "grad_norm": 0.5484623908996582, + "learning_rate": 7.268401336438189e-05, + "loss": 1.6774, + "step": 5406 + }, + { + "epoch": 1.6412202155106996, + "grad_norm": 0.6455094218254089, + "learning_rate": 7.26789510985117e-05, + "loss": 1.7618, + "step": 5407 + }, + { + "epoch": 1.641523751707391, + "grad_norm": 0.5346998572349548, + "learning_rate": 7.26738888326415e-05, + "loss": 1.4618, + "step": 5408 + }, + { + "epoch": 1.6418272879040825, + "grad_norm": 0.5332134962081909, + "learning_rate": 7.266882656677129e-05, + "loss": 1.8189, + "step": 5409 + }, + { + "epoch": 1.6421308241007742, + "grad_norm": 0.46270933747291565, + "learning_rate": 7.266376430090108e-05, + "loss": 1.7698, + "step": 5410 + }, + { + "epoch": 1.6424343602974654, + "grad_norm": 0.5155644416809082, + "learning_rate": 7.265870203503088e-05, + "loss": 1.8359, + "step": 5411 + }, + { + "epoch": 1.642737896494157, + "grad_norm": 0.539168119430542, + "learning_rate": 7.265363976916067e-05, + "loss": 1.7293, + "step": 5412 + }, + { + "epoch": 1.6430414326908482, + "grad_norm": 0.7102829813957214, + "learning_rate": 7.264857750329047e-05, + "loss": 1.3327, + "step": 5413 + }, + { + "epoch": 1.64334496888754, + "grad_norm": 0.5670261979103088, + "learning_rate": 7.264351523742026e-05, + "loss": 1.7286, + "step": 5414 + }, + { + "epoch": 1.6436485050842313, + "grad_norm": 0.5489922165870667, + "learning_rate": 7.263845297155006e-05, + "loss": 1.9019, + "step": 5415 + }, + { + "epoch": 1.6439520412809228, + "grad_norm": 0.7275190949440002, + "learning_rate": 7.263339070567987e-05, + "loss": 1.6061, + "step": 5416 + }, + { + "epoch": 1.6442555774776142, + "grad_norm": 0.6558725833892822, + "learning_rate": 7.262832843980966e-05, + "loss": 1.639, + "step": 5417 + }, + { + "epoch": 1.6445591136743056, + "grad_norm": 0.5907195806503296, + "learning_rate": 7.262326617393947e-05, + "loss": 1.935, + "step": 5418 + }, + { + "epoch": 1.644862649870997, + "grad_norm": 0.5562440156936646, + "learning_rate": 7.261820390806926e-05, + "loss": 1.6597, + "step": 5419 + }, + { + "epoch": 1.6451661860676885, + "grad_norm": 0.5652980804443359, + "learning_rate": 7.261314164219906e-05, + "loss": 1.7975, + "step": 5420 + }, + { + "epoch": 1.6454697222643802, + "grad_norm": 0.4403027296066284, + "learning_rate": 7.260807937632885e-05, + "loss": 1.2075, + "step": 5421 + }, + { + "epoch": 1.6457732584610714, + "grad_norm": 0.5183008909225464, + "learning_rate": 7.260301711045865e-05, + "loss": 1.669, + "step": 5422 + }, + { + "epoch": 1.646076794657763, + "grad_norm": 0.5930132865905762, + "learning_rate": 7.259795484458844e-05, + "loss": 1.9552, + "step": 5423 + }, + { + "epoch": 1.6463803308544542, + "grad_norm": 0.43935516476631165, + "learning_rate": 7.259289257871824e-05, + "loss": 1.7208, + "step": 5424 + }, + { + "epoch": 1.6466838670511459, + "grad_norm": 0.5676711797714233, + "learning_rate": 7.258783031284803e-05, + "loss": 1.397, + "step": 5425 + }, + { + "epoch": 1.6469874032478373, + "grad_norm": 0.5350246429443359, + "learning_rate": 7.258276804697783e-05, + "loss": 1.8004, + "step": 5426 + }, + { + "epoch": 1.6472909394445288, + "grad_norm": 0.4808463156223297, + "learning_rate": 7.257770578110764e-05, + "loss": 1.61, + "step": 5427 + }, + { + "epoch": 1.6475944756412202, + "grad_norm": 0.49013078212738037, + "learning_rate": 7.257264351523743e-05, + "loss": 1.2447, + "step": 5428 + }, + { + "epoch": 1.6478980118379116, + "grad_norm": 0.5129517912864685, + "learning_rate": 7.256758124936723e-05, + "loss": 1.5529, + "step": 5429 + }, + { + "epoch": 1.6482015480346033, + "grad_norm": 0.5757639408111572, + "learning_rate": 7.256251898349702e-05, + "loss": 1.2341, + "step": 5430 + }, + { + "epoch": 1.6485050842312945, + "grad_norm": 0.5084811449050903, + "learning_rate": 7.255745671762682e-05, + "loss": 1.8156, + "step": 5431 + }, + { + "epoch": 1.6488086204279861, + "grad_norm": 0.5663341283798218, + "learning_rate": 7.255239445175661e-05, + "loss": 1.431, + "step": 5432 + }, + { + "epoch": 1.6491121566246774, + "grad_norm": 0.6115935444831848, + "learning_rate": 7.25473321858864e-05, + "loss": 1.1595, + "step": 5433 + }, + { + "epoch": 1.649415692821369, + "grad_norm": 0.4670819342136383, + "learning_rate": 7.25422699200162e-05, + "loss": 1.658, + "step": 5434 + }, + { + "epoch": 1.6497192290180605, + "grad_norm": 0.5507335066795349, + "learning_rate": 7.2537207654146e-05, + "loss": 1.7194, + "step": 5435 + }, + { + "epoch": 1.6500227652147519, + "grad_norm": 0.5227102637290955, + "learning_rate": 7.25321453882758e-05, + "loss": 1.7898, + "step": 5436 + }, + { + "epoch": 1.6503263014114433, + "grad_norm": 0.5725428462028503, + "learning_rate": 7.25270831224056e-05, + "loss": 1.3801, + "step": 5437 + }, + { + "epoch": 1.6506298376081348, + "grad_norm": 0.6102313995361328, + "learning_rate": 7.252202085653539e-05, + "loss": 1.6622, + "step": 5438 + }, + { + "epoch": 1.6509333738048262, + "grad_norm": 0.5922355651855469, + "learning_rate": 7.251695859066519e-05, + "loss": 1.6763, + "step": 5439 + }, + { + "epoch": 1.6512369100015176, + "grad_norm": 0.8734421133995056, + "learning_rate": 7.251189632479498e-05, + "loss": 1.4523, + "step": 5440 + }, + { + "epoch": 1.6515404461982093, + "grad_norm": 0.6614634394645691, + "learning_rate": 7.250683405892478e-05, + "loss": 1.7902, + "step": 5441 + }, + { + "epoch": 1.6518439823949005, + "grad_norm": 0.5623548030853271, + "learning_rate": 7.250177179305457e-05, + "loss": 1.7837, + "step": 5442 + }, + { + "epoch": 1.6521475185915921, + "grad_norm": 0.5432645082473755, + "learning_rate": 7.249670952718437e-05, + "loss": 1.4503, + "step": 5443 + }, + { + "epoch": 1.6524510547882834, + "grad_norm": 0.5051206350326538, + "learning_rate": 7.249164726131416e-05, + "loss": 1.9885, + "step": 5444 + }, + { + "epoch": 1.652754590984975, + "grad_norm": 0.5985874533653259, + "learning_rate": 7.248658499544396e-05, + "loss": 1.8564, + "step": 5445 + }, + { + "epoch": 1.6530581271816664, + "grad_norm": 0.722251832485199, + "learning_rate": 7.248152272957376e-05, + "loss": 1.4029, + "step": 5446 + }, + { + "epoch": 1.6533616633783579, + "grad_norm": 0.5419872999191284, + "learning_rate": 7.247646046370356e-05, + "loss": 2.0266, + "step": 5447 + }, + { + "epoch": 1.6536651995750493, + "grad_norm": 0.5634152889251709, + "learning_rate": 7.247139819783335e-05, + "loss": 1.4273, + "step": 5448 + }, + { + "epoch": 1.6539687357717408, + "grad_norm": 0.6193149089813232, + "learning_rate": 7.246633593196315e-05, + "loss": 1.437, + "step": 5449 + }, + { + "epoch": 1.6542722719684322, + "grad_norm": 0.701909065246582, + "learning_rate": 7.246127366609294e-05, + "loss": 1.484, + "step": 5450 + }, + { + "epoch": 1.6545758081651236, + "grad_norm": 0.5948097109794617, + "learning_rate": 7.245621140022274e-05, + "loss": 1.1156, + "step": 5451 + }, + { + "epoch": 1.6548793443618153, + "grad_norm": 0.6138877272605896, + "learning_rate": 7.245114913435253e-05, + "loss": 1.2544, + "step": 5452 + }, + { + "epoch": 1.6551828805585065, + "grad_norm": 0.5474926829338074, + "learning_rate": 7.244608686848233e-05, + "loss": 1.6751, + "step": 5453 + }, + { + "epoch": 1.6554864167551981, + "grad_norm": 0.5998562574386597, + "learning_rate": 7.244102460261212e-05, + "loss": 1.8192, + "step": 5454 + }, + { + "epoch": 1.6557899529518894, + "grad_norm": 0.6224400997161865, + "learning_rate": 7.243596233674193e-05, + "loss": 1.819, + "step": 5455 + }, + { + "epoch": 1.656093489148581, + "grad_norm": 0.6007357239723206, + "learning_rate": 7.243090007087173e-05, + "loss": 1.7359, + "step": 5456 + }, + { + "epoch": 1.6563970253452724, + "grad_norm": 0.5641934871673584, + "learning_rate": 7.242583780500152e-05, + "loss": 1.2001, + "step": 5457 + }, + { + "epoch": 1.6567005615419639, + "grad_norm": 0.5024848580360413, + "learning_rate": 7.242077553913132e-05, + "loss": 1.8584, + "step": 5458 + }, + { + "epoch": 1.6570040977386553, + "grad_norm": 0.3787592649459839, + "learning_rate": 7.241571327326111e-05, + "loss": 1.5685, + "step": 5459 + }, + { + "epoch": 1.6573076339353467, + "grad_norm": 0.3638264536857605, + "learning_rate": 7.24106510073909e-05, + "loss": 1.8184, + "step": 5460 + }, + { + "epoch": 1.6576111701320384, + "grad_norm": 0.5221641659736633, + "learning_rate": 7.240558874152071e-05, + "loss": 1.7377, + "step": 5461 + }, + { + "epoch": 1.6579147063287296, + "grad_norm": 0.4744473993778229, + "learning_rate": 7.240052647565051e-05, + "loss": 1.9189, + "step": 5462 + }, + { + "epoch": 1.6582182425254213, + "grad_norm": 0.48213768005371094, + "learning_rate": 7.23954642097803e-05, + "loss": 1.8864, + "step": 5463 + }, + { + "epoch": 1.6585217787221125, + "grad_norm": 0.5562633872032166, + "learning_rate": 7.23904019439101e-05, + "loss": 1.3022, + "step": 5464 + }, + { + "epoch": 1.6588253149188041, + "grad_norm": 0.5425136685371399, + "learning_rate": 7.238533967803989e-05, + "loss": 1.9652, + "step": 5465 + }, + { + "epoch": 1.6591288511154956, + "grad_norm": 0.6003391146659851, + "learning_rate": 7.23802774121697e-05, + "loss": 2.0183, + "step": 5466 + }, + { + "epoch": 1.659432387312187, + "grad_norm": 0.6612154841423035, + "learning_rate": 7.23752151462995e-05, + "loss": 1.437, + "step": 5467 + }, + { + "epoch": 1.6597359235088784, + "grad_norm": 0.6031103730201721, + "learning_rate": 7.237015288042929e-05, + "loss": 1.543, + "step": 5468 + }, + { + "epoch": 1.6600394597055699, + "grad_norm": 0.5282215476036072, + "learning_rate": 7.236509061455909e-05, + "loss": 1.7141, + "step": 5469 + }, + { + "epoch": 1.6603429959022613, + "grad_norm": 1.0144010782241821, + "learning_rate": 7.236002834868888e-05, + "loss": 1.4512, + "step": 5470 + }, + { + "epoch": 1.6606465320989527, + "grad_norm": 0.5801889896392822, + "learning_rate": 7.235496608281867e-05, + "loss": 1.5097, + "step": 5471 + }, + { + "epoch": 1.6609500682956444, + "grad_norm": 1.2839747667312622, + "learning_rate": 7.234990381694847e-05, + "loss": 1.4121, + "step": 5472 + }, + { + "epoch": 1.6612536044923356, + "grad_norm": 0.6611791253089905, + "learning_rate": 7.234484155107826e-05, + "loss": 1.1927, + "step": 5473 + }, + { + "epoch": 1.6615571406890273, + "grad_norm": 0.6028724908828735, + "learning_rate": 7.233977928520806e-05, + "loss": 1.615, + "step": 5474 + }, + { + "epoch": 1.6618606768857185, + "grad_norm": 0.5648754835128784, + "learning_rate": 7.233471701933787e-05, + "loss": 1.2785, + "step": 5475 + }, + { + "epoch": 1.6621642130824101, + "grad_norm": 0.6196275353431702, + "learning_rate": 7.232965475346766e-05, + "loss": 1.6909, + "step": 5476 + }, + { + "epoch": 1.6624677492791016, + "grad_norm": 0.6454710960388184, + "learning_rate": 7.232459248759746e-05, + "loss": 1.6158, + "step": 5477 + }, + { + "epoch": 1.662771285475793, + "grad_norm": 0.6051695942878723, + "learning_rate": 7.231953022172725e-05, + "loss": 1.4259, + "step": 5478 + }, + { + "epoch": 1.6630748216724844, + "grad_norm": 0.9160197377204895, + "learning_rate": 7.231446795585705e-05, + "loss": 1.3115, + "step": 5479 + }, + { + "epoch": 1.6633783578691759, + "grad_norm": 0.5905101299285889, + "learning_rate": 7.230940568998684e-05, + "loss": 1.5463, + "step": 5480 + }, + { + "epoch": 1.6636818940658673, + "grad_norm": 0.48909255862236023, + "learning_rate": 7.230434342411664e-05, + "loss": 1.7501, + "step": 5481 + }, + { + "epoch": 1.6639854302625587, + "grad_norm": 0.4441916048526764, + "learning_rate": 7.229928115824643e-05, + "loss": 1.8818, + "step": 5482 + }, + { + "epoch": 1.6642889664592504, + "grad_norm": 0.45585280656814575, + "learning_rate": 7.229421889237623e-05, + "loss": 1.8129, + "step": 5483 + }, + { + "epoch": 1.6645925026559416, + "grad_norm": 0.5730534195899963, + "learning_rate": 7.228915662650602e-05, + "loss": 1.2908, + "step": 5484 + }, + { + "epoch": 1.6648960388526333, + "grad_norm": 0.48153162002563477, + "learning_rate": 7.228409436063583e-05, + "loss": 1.6484, + "step": 5485 + }, + { + "epoch": 1.6651995750493245, + "grad_norm": 0.5049116015434265, + "learning_rate": 7.227903209476562e-05, + "loss": 1.7223, + "step": 5486 + }, + { + "epoch": 1.6655031112460161, + "grad_norm": 0.43153953552246094, + "learning_rate": 7.227396982889542e-05, + "loss": 1.6567, + "step": 5487 + }, + { + "epoch": 1.6658066474427076, + "grad_norm": 0.4768941104412079, + "learning_rate": 7.226890756302521e-05, + "loss": 1.7213, + "step": 5488 + }, + { + "epoch": 1.666110183639399, + "grad_norm": 0.39153575897216797, + "learning_rate": 7.226384529715501e-05, + "loss": 1.2501, + "step": 5489 + }, + { + "epoch": 1.6664137198360904, + "grad_norm": 0.4094899594783783, + "learning_rate": 7.22587830312848e-05, + "loss": 1.0887, + "step": 5490 + }, + { + "epoch": 1.6667172560327819, + "grad_norm": 0.5269747972488403, + "learning_rate": 7.22537207654146e-05, + "loss": 1.3502, + "step": 5491 + }, + { + "epoch": 1.6670207922294735, + "grad_norm": 0.4670669138431549, + "learning_rate": 7.224865849954439e-05, + "loss": 1.6927, + "step": 5492 + }, + { + "epoch": 1.6673243284261647, + "grad_norm": 0.4470636546611786, + "learning_rate": 7.224359623367419e-05, + "loss": 1.6558, + "step": 5493 + }, + { + "epoch": 1.6676278646228564, + "grad_norm": 0.5684782266616821, + "learning_rate": 7.2238533967804e-05, + "loss": 2.0172, + "step": 5494 + }, + { + "epoch": 1.6679314008195476, + "grad_norm": 0.5105867981910706, + "learning_rate": 7.223347170193379e-05, + "loss": 1.4969, + "step": 5495 + }, + { + "epoch": 1.6682349370162393, + "grad_norm": 0.4778842628002167, + "learning_rate": 7.222840943606359e-05, + "loss": 1.9251, + "step": 5496 + }, + { + "epoch": 1.6685384732129307, + "grad_norm": 0.5652633905410767, + "learning_rate": 7.222334717019338e-05, + "loss": 2.0057, + "step": 5497 + }, + { + "epoch": 1.6688420094096221, + "grad_norm": 0.5117583870887756, + "learning_rate": 7.221828490432317e-05, + "loss": 1.7944, + "step": 5498 + }, + { + "epoch": 1.6691455456063136, + "grad_norm": 0.5227757096290588, + "learning_rate": 7.221322263845297e-05, + "loss": 1.8122, + "step": 5499 + }, + { + "epoch": 1.669449081803005, + "grad_norm": 1.003037691116333, + "learning_rate": 7.220816037258276e-05, + "loss": 1.8023, + "step": 5500 + }, + { + "epoch": 1.6697526179996964, + "grad_norm": 0.5822034478187561, + "learning_rate": 7.220309810671256e-05, + "loss": 1.8406, + "step": 5501 + }, + { + "epoch": 1.6700561541963879, + "grad_norm": 0.5243530869483948, + "learning_rate": 7.219803584084235e-05, + "loss": 1.3146, + "step": 5502 + }, + { + "epoch": 1.6703596903930795, + "grad_norm": 0.6547448635101318, + "learning_rate": 7.219297357497216e-05, + "loss": 1.9529, + "step": 5503 + }, + { + "epoch": 1.6706632265897707, + "grad_norm": 0.5289431810379028, + "learning_rate": 7.218791130910196e-05, + "loss": 1.8209, + "step": 5504 + }, + { + "epoch": 1.6709667627864624, + "grad_norm": 0.47597306966781616, + "learning_rate": 7.218284904323177e-05, + "loss": 1.7394, + "step": 5505 + }, + { + "epoch": 1.6712702989831536, + "grad_norm": 0.6972471475601196, + "learning_rate": 7.217778677736156e-05, + "loss": 1.3056, + "step": 5506 + }, + { + "epoch": 1.6715738351798453, + "grad_norm": 0.6936922669410706, + "learning_rate": 7.217272451149136e-05, + "loss": 1.311, + "step": 5507 + }, + { + "epoch": 1.6718773713765367, + "grad_norm": 0.613391637802124, + "learning_rate": 7.216766224562115e-05, + "loss": 1.6611, + "step": 5508 + }, + { + "epoch": 1.6721809075732281, + "grad_norm": 0.5067817568778992, + "learning_rate": 7.216259997975094e-05, + "loss": 1.534, + "step": 5509 + }, + { + "epoch": 1.6724844437699196, + "grad_norm": 0.8583042025566101, + "learning_rate": 7.215753771388074e-05, + "loss": 1.2529, + "step": 5510 + }, + { + "epoch": 1.672787979966611, + "grad_norm": 0.6113730072975159, + "learning_rate": 7.215247544801053e-05, + "loss": 1.3955, + "step": 5511 + }, + { + "epoch": 1.6730915161633024, + "grad_norm": 0.5480425357818604, + "learning_rate": 7.214741318214033e-05, + "loss": 1.6759, + "step": 5512 + }, + { + "epoch": 1.6733950523599939, + "grad_norm": 0.5304676294326782, + "learning_rate": 7.214235091627012e-05, + "loss": 1.7908, + "step": 5513 + }, + { + "epoch": 1.6736985885566855, + "grad_norm": 0.5603669285774231, + "learning_rate": 7.213728865039993e-05, + "loss": 1.6092, + "step": 5514 + }, + { + "epoch": 1.6740021247533767, + "grad_norm": 0.5751498937606812, + "learning_rate": 7.213222638452973e-05, + "loss": 1.3722, + "step": 5515 + }, + { + "epoch": 1.6743056609500684, + "grad_norm": 0.5078593492507935, + "learning_rate": 7.212716411865952e-05, + "loss": 1.206, + "step": 5516 + }, + { + "epoch": 1.6746091971467596, + "grad_norm": 0.5524204969406128, + "learning_rate": 7.212210185278932e-05, + "loss": 1.8564, + "step": 5517 + }, + { + "epoch": 1.6749127333434513, + "grad_norm": 0.5641534328460693, + "learning_rate": 7.211703958691911e-05, + "loss": 1.6019, + "step": 5518 + }, + { + "epoch": 1.6752162695401427, + "grad_norm": 0.4548323452472687, + "learning_rate": 7.21119773210489e-05, + "loss": 2.1299, + "step": 5519 + }, + { + "epoch": 1.6755198057368341, + "grad_norm": 0.4769175052642822, + "learning_rate": 7.21069150551787e-05, + "loss": 1.5924, + "step": 5520 + }, + { + "epoch": 1.6758233419335256, + "grad_norm": 0.4972521960735321, + "learning_rate": 7.21018527893085e-05, + "loss": 1.5578, + "step": 5521 + }, + { + "epoch": 1.676126878130217, + "grad_norm": 0.5828529000282288, + "learning_rate": 7.209679052343829e-05, + "loss": 1.5206, + "step": 5522 + }, + { + "epoch": 1.6764304143269086, + "grad_norm": 0.535059928894043, + "learning_rate": 7.209172825756809e-05, + "loss": 1.6381, + "step": 5523 + }, + { + "epoch": 1.6767339505235999, + "grad_norm": 0.8154981136322021, + "learning_rate": 7.20866659916979e-05, + "loss": 1.498, + "step": 5524 + }, + { + "epoch": 1.6770374867202915, + "grad_norm": 0.5557091236114502, + "learning_rate": 7.208160372582769e-05, + "loss": 1.7889, + "step": 5525 + }, + { + "epoch": 1.6773410229169827, + "grad_norm": 0.5542075634002686, + "learning_rate": 7.207654145995748e-05, + "loss": 1.758, + "step": 5526 + }, + { + "epoch": 1.6776445591136744, + "grad_norm": 0.46830353140830994, + "learning_rate": 7.207147919408728e-05, + "loss": 1.9262, + "step": 5527 + }, + { + "epoch": 1.6779480953103656, + "grad_norm": 0.5317898988723755, + "learning_rate": 7.206641692821707e-05, + "loss": 1.4244, + "step": 5528 + }, + { + "epoch": 1.6782516315070573, + "grad_norm": 0.4785618185997009, + "learning_rate": 7.206135466234687e-05, + "loss": 1.2493, + "step": 5529 + }, + { + "epoch": 1.6785551677037487, + "grad_norm": 0.5754481554031372, + "learning_rate": 7.205629239647666e-05, + "loss": 1.9019, + "step": 5530 + }, + { + "epoch": 1.6788587039004401, + "grad_norm": 0.5696677565574646, + "learning_rate": 7.205123013060646e-05, + "loss": 1.9587, + "step": 5531 + }, + { + "epoch": 1.6791622400971316, + "grad_norm": 0.46303999423980713, + "learning_rate": 7.204616786473625e-05, + "loss": 1.352, + "step": 5532 + }, + { + "epoch": 1.679465776293823, + "grad_norm": 0.5733750462532043, + "learning_rate": 7.204110559886606e-05, + "loss": 1.442, + "step": 5533 + }, + { + "epoch": 1.6797693124905146, + "grad_norm": 0.5868303775787354, + "learning_rate": 7.203604333299586e-05, + "loss": 1.8534, + "step": 5534 + }, + { + "epoch": 1.6800728486872059, + "grad_norm": 0.532902717590332, + "learning_rate": 7.203098106712565e-05, + "loss": 1.748, + "step": 5535 + }, + { + "epoch": 1.6803763848838975, + "grad_norm": 0.6015176773071289, + "learning_rate": 7.202591880125544e-05, + "loss": 1.7901, + "step": 5536 + }, + { + "epoch": 1.6806799210805887, + "grad_norm": 0.880872905254364, + "learning_rate": 7.202085653538524e-05, + "loss": 1.5535, + "step": 5537 + }, + { + "epoch": 1.6809834572772804, + "grad_norm": 0.515289843082428, + "learning_rate": 7.201579426951503e-05, + "loss": 1.8207, + "step": 5538 + }, + { + "epoch": 1.6812869934739718, + "grad_norm": 0.9315522313117981, + "learning_rate": 7.201073200364483e-05, + "loss": 1.9596, + "step": 5539 + }, + { + "epoch": 1.6815905296706632, + "grad_norm": 0.700817883014679, + "learning_rate": 7.200566973777462e-05, + "loss": 1.8407, + "step": 5540 + }, + { + "epoch": 1.6818940658673547, + "grad_norm": 0.5674588084220886, + "learning_rate": 7.200060747190442e-05, + "loss": 1.6263, + "step": 5541 + }, + { + "epoch": 1.6821976020640461, + "grad_norm": 0.599992573261261, + "learning_rate": 7.199554520603423e-05, + "loss": 1.8701, + "step": 5542 + }, + { + "epoch": 1.6825011382607375, + "grad_norm": 0.49618181586265564, + "learning_rate": 7.199048294016402e-05, + "loss": 1.6515, + "step": 5543 + }, + { + "epoch": 1.682804674457429, + "grad_norm": 0.5886253714561462, + "learning_rate": 7.198542067429382e-05, + "loss": 1.5716, + "step": 5544 + }, + { + "epoch": 1.6831082106541206, + "grad_norm": 0.5282119512557983, + "learning_rate": 7.198035840842361e-05, + "loss": 1.7686, + "step": 5545 + }, + { + "epoch": 1.6834117468508119, + "grad_norm": 0.5557923913002014, + "learning_rate": 7.19752961425534e-05, + "loss": 1.7876, + "step": 5546 + }, + { + "epoch": 1.6837152830475035, + "grad_norm": 0.5981353521347046, + "learning_rate": 7.19702338766832e-05, + "loss": 1.4325, + "step": 5547 + }, + { + "epoch": 1.6840188192441947, + "grad_norm": 0.5523660182952881, + "learning_rate": 7.1965171610813e-05, + "loss": 1.6176, + "step": 5548 + }, + { + "epoch": 1.6843223554408864, + "grad_norm": 0.886365532875061, + "learning_rate": 7.196010934494279e-05, + "loss": 1.5195, + "step": 5549 + }, + { + "epoch": 1.6846258916375778, + "grad_norm": 0.5468156337738037, + "learning_rate": 7.19550470790726e-05, + "loss": 1.6558, + "step": 5550 + }, + { + "epoch": 1.6849294278342692, + "grad_norm": 0.682375967502594, + "learning_rate": 7.19499848132024e-05, + "loss": 1.7567, + "step": 5551 + }, + { + "epoch": 1.6852329640309607, + "grad_norm": 0.5643876791000366, + "learning_rate": 7.194492254733219e-05, + "loss": 1.9613, + "step": 5552 + }, + { + "epoch": 1.6855365002276521, + "grad_norm": 0.5802308917045593, + "learning_rate": 7.1939860281462e-05, + "loss": 1.7519, + "step": 5553 + }, + { + "epoch": 1.6858400364243438, + "grad_norm": 0.5967023968696594, + "learning_rate": 7.193479801559179e-05, + "loss": 1.024, + "step": 5554 + }, + { + "epoch": 1.686143572621035, + "grad_norm": 0.5764415860176086, + "learning_rate": 7.192973574972159e-05, + "loss": 1.2554, + "step": 5555 + }, + { + "epoch": 1.6864471088177266, + "grad_norm": 0.6134281754493713, + "learning_rate": 7.192467348385138e-05, + "loss": 1.884, + "step": 5556 + }, + { + "epoch": 1.6867506450144178, + "grad_norm": 0.4594460427761078, + "learning_rate": 7.191961121798118e-05, + "loss": 1.9008, + "step": 5557 + }, + { + "epoch": 1.6870541812111095, + "grad_norm": 0.5448347330093384, + "learning_rate": 7.191454895211097e-05, + "loss": 1.1015, + "step": 5558 + }, + { + "epoch": 1.6873577174078007, + "grad_norm": 0.4324944317340851, + "learning_rate": 7.190948668624077e-05, + "loss": 0.9942, + "step": 5559 + }, + { + "epoch": 1.6876612536044924, + "grad_norm": 0.5763707160949707, + "learning_rate": 7.190442442037056e-05, + "loss": 1.428, + "step": 5560 + }, + { + "epoch": 1.6879647898011838, + "grad_norm": 0.5779836773872375, + "learning_rate": 7.189936215450036e-05, + "loss": 1.7867, + "step": 5561 + }, + { + "epoch": 1.6882683259978752, + "grad_norm": 0.6972694993019104, + "learning_rate": 7.189429988863015e-05, + "loss": 1.7712, + "step": 5562 + }, + { + "epoch": 1.6885718621945667, + "grad_norm": 1.026524305343628, + "learning_rate": 7.188923762275996e-05, + "loss": 1.4089, + "step": 5563 + }, + { + "epoch": 1.688875398391258, + "grad_norm": 0.5667091608047485, + "learning_rate": 7.188417535688975e-05, + "loss": 2.1362, + "step": 5564 + }, + { + "epoch": 1.6891789345879498, + "grad_norm": 0.48868703842163086, + "learning_rate": 7.187911309101955e-05, + "loss": 1.493, + "step": 5565 + }, + { + "epoch": 1.689482470784641, + "grad_norm": 0.43740716576576233, + "learning_rate": 7.187405082514934e-05, + "loss": 1.3792, + "step": 5566 + }, + { + "epoch": 1.6897860069813326, + "grad_norm": 0.49257686734199524, + "learning_rate": 7.186898855927914e-05, + "loss": 2.3111, + "step": 5567 + }, + { + "epoch": 1.6900895431780238, + "grad_norm": 0.6309003233909607, + "learning_rate": 7.186392629340893e-05, + "loss": 1.7269, + "step": 5568 + }, + { + "epoch": 1.6903930793747155, + "grad_norm": 0.5394817590713501, + "learning_rate": 7.185886402753873e-05, + "loss": 1.35, + "step": 5569 + }, + { + "epoch": 1.690696615571407, + "grad_norm": 0.5133473873138428, + "learning_rate": 7.185380176166852e-05, + "loss": 1.1287, + "step": 5570 + }, + { + "epoch": 1.6910001517680984, + "grad_norm": 0.5639081597328186, + "learning_rate": 7.184873949579832e-05, + "loss": 1.3801, + "step": 5571 + }, + { + "epoch": 1.6913036879647898, + "grad_norm": 0.5387223362922668, + "learning_rate": 7.184367722992813e-05, + "loss": 1.5725, + "step": 5572 + }, + { + "epoch": 1.6916072241614812, + "grad_norm": 0.4873654246330261, + "learning_rate": 7.183861496405792e-05, + "loss": 1.8034, + "step": 5573 + }, + { + "epoch": 1.6919107603581727, + "grad_norm": 0.5473541617393494, + "learning_rate": 7.183355269818771e-05, + "loss": 1.6244, + "step": 5574 + }, + { + "epoch": 1.692214296554864, + "grad_norm": 0.5153944492340088, + "learning_rate": 7.182849043231751e-05, + "loss": 1.8091, + "step": 5575 + }, + { + "epoch": 1.6925178327515558, + "grad_norm": 0.5295174717903137, + "learning_rate": 7.18234281664473e-05, + "loss": 1.8437, + "step": 5576 + }, + { + "epoch": 1.692821368948247, + "grad_norm": 0.46947070956230164, + "learning_rate": 7.18183659005771e-05, + "loss": 1.0215, + "step": 5577 + }, + { + "epoch": 1.6931249051449386, + "grad_norm": 0.5614966154098511, + "learning_rate": 7.18133036347069e-05, + "loss": 1.645, + "step": 5578 + }, + { + "epoch": 1.6934284413416298, + "grad_norm": 0.5633820295333862, + "learning_rate": 7.180824136883669e-05, + "loss": 1.8471, + "step": 5579 + }, + { + "epoch": 1.6937319775383215, + "grad_norm": 0.5536201000213623, + "learning_rate": 7.180317910296648e-05, + "loss": 1.539, + "step": 5580 + }, + { + "epoch": 1.694035513735013, + "grad_norm": 0.5490652322769165, + "learning_rate": 7.179811683709629e-05, + "loss": 1.2846, + "step": 5581 + }, + { + "epoch": 1.6943390499317044, + "grad_norm": 0.47489118576049805, + "learning_rate": 7.179305457122609e-05, + "loss": 1.6611, + "step": 5582 + }, + { + "epoch": 1.6946425861283958, + "grad_norm": 0.5121912956237793, + "learning_rate": 7.178799230535588e-05, + "loss": 1.8077, + "step": 5583 + }, + { + "epoch": 1.6949461223250872, + "grad_norm": 0.5762444138526917, + "learning_rate": 7.178293003948568e-05, + "loss": 1.6236, + "step": 5584 + }, + { + "epoch": 1.6952496585217787, + "grad_norm": 0.5737335085868835, + "learning_rate": 7.177786777361547e-05, + "loss": 1.6909, + "step": 5585 + }, + { + "epoch": 1.69555319471847, + "grad_norm": 0.573403537273407, + "learning_rate": 7.177280550774527e-05, + "loss": 1.4276, + "step": 5586 + }, + { + "epoch": 1.6958567309151618, + "grad_norm": 0.6170399188995361, + "learning_rate": 7.176774324187506e-05, + "loss": 1.5555, + "step": 5587 + }, + { + "epoch": 1.696160267111853, + "grad_norm": 0.5213575959205627, + "learning_rate": 7.176268097600486e-05, + "loss": 1.7751, + "step": 5588 + }, + { + "epoch": 1.6964638033085446, + "grad_norm": 0.501232385635376, + "learning_rate": 7.175761871013465e-05, + "loss": 1.5369, + "step": 5589 + }, + { + "epoch": 1.6967673395052358, + "grad_norm": 0.6428576707839966, + "learning_rate": 7.175255644426445e-05, + "loss": 1.349, + "step": 5590 + }, + { + "epoch": 1.6970708757019275, + "grad_norm": 0.5654997825622559, + "learning_rate": 7.174749417839425e-05, + "loss": 1.6381, + "step": 5591 + }, + { + "epoch": 1.697374411898619, + "grad_norm": 0.5747986435890198, + "learning_rate": 7.174243191252405e-05, + "loss": 1.6836, + "step": 5592 + }, + { + "epoch": 1.6976779480953104, + "grad_norm": 0.48611271381378174, + "learning_rate": 7.173736964665384e-05, + "loss": 1.8771, + "step": 5593 + }, + { + "epoch": 1.6979814842920018, + "grad_norm": 0.6112745404243469, + "learning_rate": 7.173230738078365e-05, + "loss": 1.4543, + "step": 5594 + }, + { + "epoch": 1.6982850204886932, + "grad_norm": 0.5568521022796631, + "learning_rate": 7.172724511491345e-05, + "loss": 1.2674, + "step": 5595 + }, + { + "epoch": 1.6985885566853849, + "grad_norm": 0.9814030528068542, + "learning_rate": 7.172218284904324e-05, + "loss": 1.418, + "step": 5596 + }, + { + "epoch": 1.698892092882076, + "grad_norm": 0.5787724852561951, + "learning_rate": 7.171712058317304e-05, + "loss": 1.8167, + "step": 5597 + }, + { + "epoch": 1.6991956290787678, + "grad_norm": 0.527346670627594, + "learning_rate": 7.171205831730283e-05, + "loss": 1.9819, + "step": 5598 + }, + { + "epoch": 1.699499165275459, + "grad_norm": 0.5274780988693237, + "learning_rate": 7.170699605143263e-05, + "loss": 2.0423, + "step": 5599 + }, + { + "epoch": 1.6998027014721506, + "grad_norm": 0.6172202825546265, + "learning_rate": 7.170193378556242e-05, + "loss": 1.5017, + "step": 5600 + }, + { + "epoch": 1.700106237668842, + "grad_norm": 0.6280341148376465, + "learning_rate": 7.169687151969221e-05, + "loss": 1.8298, + "step": 5601 + }, + { + "epoch": 1.7004097738655335, + "grad_norm": 0.5946933627128601, + "learning_rate": 7.169180925382202e-05, + "loss": 1.5956, + "step": 5602 + }, + { + "epoch": 1.700713310062225, + "grad_norm": 0.5152033567428589, + "learning_rate": 7.168674698795182e-05, + "loss": 1.9501, + "step": 5603 + }, + { + "epoch": 1.7010168462589164, + "grad_norm": 0.7176896333694458, + "learning_rate": 7.168168472208161e-05, + "loss": 1.8614, + "step": 5604 + }, + { + "epoch": 1.7013203824556078, + "grad_norm": 0.5222963690757751, + "learning_rate": 7.167662245621141e-05, + "loss": 1.8237, + "step": 5605 + }, + { + "epoch": 1.7016239186522992, + "grad_norm": 0.7052047848701477, + "learning_rate": 7.16715601903412e-05, + "loss": 1.4014, + "step": 5606 + }, + { + "epoch": 1.7019274548489909, + "grad_norm": 0.5208356380462646, + "learning_rate": 7.1666497924471e-05, + "loss": 1.4113, + "step": 5607 + }, + { + "epoch": 1.702230991045682, + "grad_norm": 0.6521908044815063, + "learning_rate": 7.166143565860079e-05, + "loss": 1.6173, + "step": 5608 + }, + { + "epoch": 1.7025345272423738, + "grad_norm": 0.5605899691581726, + "learning_rate": 7.165637339273059e-05, + "loss": 1.4772, + "step": 5609 + }, + { + "epoch": 1.702838063439065, + "grad_norm": 0.45523953437805176, + "learning_rate": 7.165131112686038e-05, + "loss": 1.5805, + "step": 5610 + }, + { + "epoch": 1.7031415996357566, + "grad_norm": 0.8355624675750732, + "learning_rate": 7.164624886099019e-05, + "loss": 1.7984, + "step": 5611 + }, + { + "epoch": 1.703445135832448, + "grad_norm": 0.4628305733203888, + "learning_rate": 7.164118659511998e-05, + "loss": 1.4007, + "step": 5612 + }, + { + "epoch": 1.7037486720291395, + "grad_norm": 0.47250911593437195, + "learning_rate": 7.163612432924978e-05, + "loss": 1.836, + "step": 5613 + }, + { + "epoch": 1.704052208225831, + "grad_norm": 0.43420594930648804, + "learning_rate": 7.163106206337957e-05, + "loss": 1.3316, + "step": 5614 + }, + { + "epoch": 1.7043557444225224, + "grad_norm": 0.5203584432601929, + "learning_rate": 7.162599979750937e-05, + "loss": 2.0178, + "step": 5615 + }, + { + "epoch": 1.7046592806192138, + "grad_norm": 0.6097245216369629, + "learning_rate": 7.162093753163916e-05, + "loss": 1.3596, + "step": 5616 + }, + { + "epoch": 1.7049628168159052, + "grad_norm": 0.6182841062545776, + "learning_rate": 7.161587526576896e-05, + "loss": 1.5702, + "step": 5617 + }, + { + "epoch": 1.7052663530125969, + "grad_norm": 0.5049747228622437, + "learning_rate": 7.161081299989875e-05, + "loss": 1.7493, + "step": 5618 + }, + { + "epoch": 1.705569889209288, + "grad_norm": 0.5657998323440552, + "learning_rate": 7.160575073402855e-05, + "loss": 1.8784, + "step": 5619 + }, + { + "epoch": 1.7058734254059797, + "grad_norm": 0.6417815089225769, + "learning_rate": 7.160068846815836e-05, + "loss": 1.5989, + "step": 5620 + }, + { + "epoch": 1.706176961602671, + "grad_norm": 0.5697042942047119, + "learning_rate": 7.159562620228815e-05, + "loss": 1.7344, + "step": 5621 + }, + { + "epoch": 1.7064804977993626, + "grad_norm": 0.6542501449584961, + "learning_rate": 7.159056393641795e-05, + "loss": 1.7907, + "step": 5622 + }, + { + "epoch": 1.706784033996054, + "grad_norm": 0.5756997466087341, + "learning_rate": 7.158550167054774e-05, + "loss": 1.7232, + "step": 5623 + }, + { + "epoch": 1.7070875701927455, + "grad_norm": 0.43723130226135254, + "learning_rate": 7.158043940467754e-05, + "loss": 1.6653, + "step": 5624 + }, + { + "epoch": 1.707391106389437, + "grad_norm": 0.5785560011863708, + "learning_rate": 7.157537713880733e-05, + "loss": 1.3398, + "step": 5625 + }, + { + "epoch": 1.7076946425861284, + "grad_norm": 0.7036682367324829, + "learning_rate": 7.157031487293713e-05, + "loss": 1.7571, + "step": 5626 + }, + { + "epoch": 1.70799817878282, + "grad_norm": 0.9839090704917908, + "learning_rate": 7.156525260706692e-05, + "loss": 1.5356, + "step": 5627 + }, + { + "epoch": 1.7083017149795112, + "grad_norm": 0.4686150550842285, + "learning_rate": 7.156019034119672e-05, + "loss": 1.7265, + "step": 5628 + }, + { + "epoch": 1.7086052511762029, + "grad_norm": 0.5854867100715637, + "learning_rate": 7.155512807532651e-05, + "loss": 1.9357, + "step": 5629 + }, + { + "epoch": 1.708908787372894, + "grad_norm": 0.5611643195152283, + "learning_rate": 7.155006580945632e-05, + "loss": 1.416, + "step": 5630 + }, + { + "epoch": 1.7092123235695857, + "grad_norm": 0.5986021161079407, + "learning_rate": 7.154500354358611e-05, + "loss": 1.5378, + "step": 5631 + }, + { + "epoch": 1.7095158597662772, + "grad_norm": 0.5221402049064636, + "learning_rate": 7.153994127771591e-05, + "loss": 1.6811, + "step": 5632 + }, + { + "epoch": 1.7098193959629686, + "grad_norm": 0.5451841950416565, + "learning_rate": 7.15348790118457e-05, + "loss": 1.7699, + "step": 5633 + }, + { + "epoch": 1.71012293215966, + "grad_norm": 0.44219285249710083, + "learning_rate": 7.15298167459755e-05, + "loss": 1.3474, + "step": 5634 + }, + { + "epoch": 1.7104264683563515, + "grad_norm": 0.5203813910484314, + "learning_rate": 7.152475448010529e-05, + "loss": 1.3492, + "step": 5635 + }, + { + "epoch": 1.710730004553043, + "grad_norm": 0.538673460483551, + "learning_rate": 7.151969221423509e-05, + "loss": 1.7115, + "step": 5636 + }, + { + "epoch": 1.7110335407497343, + "grad_norm": 0.4732709527015686, + "learning_rate": 7.151462994836488e-05, + "loss": 1.8847, + "step": 5637 + }, + { + "epoch": 1.711337076946426, + "grad_norm": 0.619422435760498, + "learning_rate": 7.150956768249468e-05, + "loss": 1.7269, + "step": 5638 + }, + { + "epoch": 1.7116406131431172, + "grad_norm": 0.38589712977409363, + "learning_rate": 7.150450541662448e-05, + "loss": 1.6737, + "step": 5639 + }, + { + "epoch": 1.7119441493398089, + "grad_norm": 0.514140784740448, + "learning_rate": 7.149944315075428e-05, + "loss": 1.8193, + "step": 5640 + }, + { + "epoch": 1.7122476855365, + "grad_norm": 4.0823493003845215, + "learning_rate": 7.149438088488409e-05, + "loss": 1.8681, + "step": 5641 + }, + { + "epoch": 1.7125512217331917, + "grad_norm": 0.6078541874885559, + "learning_rate": 7.148931861901388e-05, + "loss": 1.7254, + "step": 5642 + }, + { + "epoch": 1.7128547579298832, + "grad_norm": 0.5429568886756897, + "learning_rate": 7.148425635314368e-05, + "loss": 1.7894, + "step": 5643 + }, + { + "epoch": 1.7131582941265746, + "grad_norm": 0.4650183320045471, + "learning_rate": 7.147919408727347e-05, + "loss": 1.4881, + "step": 5644 + }, + { + "epoch": 1.713461830323266, + "grad_norm": 0.5098140835762024, + "learning_rate": 7.147413182140327e-05, + "loss": 1.7343, + "step": 5645 + }, + { + "epoch": 1.7137653665199575, + "grad_norm": 0.5701392889022827, + "learning_rate": 7.146906955553306e-05, + "loss": 1.6419, + "step": 5646 + }, + { + "epoch": 1.714068902716649, + "grad_norm": 0.6229302883148193, + "learning_rate": 7.146400728966286e-05, + "loss": 1.4583, + "step": 5647 + }, + { + "epoch": 1.7143724389133403, + "grad_norm": 0.6150268912315369, + "learning_rate": 7.145894502379265e-05, + "loss": 1.4473, + "step": 5648 + }, + { + "epoch": 1.714675975110032, + "grad_norm": 0.5583786964416504, + "learning_rate": 7.145388275792245e-05, + "loss": 1.7754, + "step": 5649 + }, + { + "epoch": 1.7149795113067232, + "grad_norm": 0.4834759831428528, + "learning_rate": 7.144882049205225e-05, + "loss": 1.1987, + "step": 5650 + }, + { + "epoch": 1.7152830475034149, + "grad_norm": 0.6216395497322083, + "learning_rate": 7.144375822618205e-05, + "loss": 2.0237, + "step": 5651 + }, + { + "epoch": 1.715586583700106, + "grad_norm": 0.4688428044319153, + "learning_rate": 7.143869596031184e-05, + "loss": 0.7604, + "step": 5652 + }, + { + "epoch": 1.7158901198967977, + "grad_norm": 0.45872947573661804, + "learning_rate": 7.143363369444164e-05, + "loss": 1.4403, + "step": 5653 + }, + { + "epoch": 1.7161936560934892, + "grad_norm": 0.49328091740608215, + "learning_rate": 7.142857142857143e-05, + "loss": 1.4274, + "step": 5654 + }, + { + "epoch": 1.7164971922901806, + "grad_norm": 0.5940247178077698, + "learning_rate": 7.142350916270123e-05, + "loss": 1.6925, + "step": 5655 + }, + { + "epoch": 1.716800728486872, + "grad_norm": 0.4424358308315277, + "learning_rate": 7.141844689683102e-05, + "loss": 1.5618, + "step": 5656 + }, + { + "epoch": 1.7171042646835635, + "grad_norm": 0.4610697329044342, + "learning_rate": 7.141338463096082e-05, + "loss": 1.2538, + "step": 5657 + }, + { + "epoch": 1.7174078008802551, + "grad_norm": 0.5324227809906006, + "learning_rate": 7.140832236509061e-05, + "loss": 1.4877, + "step": 5658 + }, + { + "epoch": 1.7177113370769463, + "grad_norm": 0.5017738938331604, + "learning_rate": 7.140326009922042e-05, + "loss": 1.7222, + "step": 5659 + }, + { + "epoch": 1.718014873273638, + "grad_norm": 0.5225081443786621, + "learning_rate": 7.139819783335022e-05, + "loss": 1.6279, + "step": 5660 + }, + { + "epoch": 1.7183184094703292, + "grad_norm": 0.49963754415512085, + "learning_rate": 7.139313556748001e-05, + "loss": 1.6897, + "step": 5661 + }, + { + "epoch": 1.7186219456670209, + "grad_norm": 0.5684502124786377, + "learning_rate": 7.13880733016098e-05, + "loss": 1.9999, + "step": 5662 + }, + { + "epoch": 1.7189254818637123, + "grad_norm": 0.560808002948761, + "learning_rate": 7.13830110357396e-05, + "loss": 1.7136, + "step": 5663 + }, + { + "epoch": 1.7192290180604037, + "grad_norm": 0.6076765656471252, + "learning_rate": 7.13779487698694e-05, + "loss": 1.5934, + "step": 5664 + }, + { + "epoch": 1.7195325542570952, + "grad_norm": 0.47294458746910095, + "learning_rate": 7.137288650399919e-05, + "loss": 1.3456, + "step": 5665 + }, + { + "epoch": 1.7198360904537866, + "grad_norm": 0.7062551975250244, + "learning_rate": 7.136782423812899e-05, + "loss": 1.3726, + "step": 5666 + }, + { + "epoch": 1.720139626650478, + "grad_norm": 0.6528676152229309, + "learning_rate": 7.136276197225878e-05, + "loss": 1.3429, + "step": 5667 + }, + { + "epoch": 1.7204431628471695, + "grad_norm": 0.6872678995132446, + "learning_rate": 7.135769970638857e-05, + "loss": 1.1651, + "step": 5668 + }, + { + "epoch": 1.7207466990438611, + "grad_norm": 0.8360339403152466, + "learning_rate": 7.135263744051838e-05, + "loss": 1.7624, + "step": 5669 + }, + { + "epoch": 1.7210502352405523, + "grad_norm": 0.45598936080932617, + "learning_rate": 7.134757517464818e-05, + "loss": 1.818, + "step": 5670 + }, + { + "epoch": 1.721353771437244, + "grad_norm": 0.5153748989105225, + "learning_rate": 7.134251290877797e-05, + "loss": 1.8247, + "step": 5671 + }, + { + "epoch": 1.7216573076339352, + "grad_norm": 0.5611364245414734, + "learning_rate": 7.133745064290777e-05, + "loss": 1.4998, + "step": 5672 + }, + { + "epoch": 1.7219608438306269, + "grad_norm": 0.6226168274879456, + "learning_rate": 7.133238837703756e-05, + "loss": 1.7166, + "step": 5673 + }, + { + "epoch": 1.7222643800273183, + "grad_norm": 0.5680972933769226, + "learning_rate": 7.132732611116736e-05, + "loss": 0.956, + "step": 5674 + }, + { + "epoch": 1.7225679162240097, + "grad_norm": 0.522735595703125, + "learning_rate": 7.132226384529715e-05, + "loss": 1.7789, + "step": 5675 + }, + { + "epoch": 1.7228714524207012, + "grad_norm": 0.39815565943717957, + "learning_rate": 7.131720157942695e-05, + "loss": 1.639, + "step": 5676 + }, + { + "epoch": 1.7231749886173926, + "grad_norm": 0.5538575053215027, + "learning_rate": 7.131213931355674e-05, + "loss": 1.7431, + "step": 5677 + }, + { + "epoch": 1.723478524814084, + "grad_norm": 0.6126648783683777, + "learning_rate": 7.130707704768655e-05, + "loss": 1.571, + "step": 5678 + }, + { + "epoch": 1.7237820610107755, + "grad_norm": 0.6345686316490173, + "learning_rate": 7.130201478181634e-05, + "loss": 1.6244, + "step": 5679 + }, + { + "epoch": 1.7240855972074671, + "grad_norm": 0.5709595084190369, + "learning_rate": 7.129695251594614e-05, + "loss": 1.6958, + "step": 5680 + }, + { + "epoch": 1.7243891334041583, + "grad_norm": 0.5866050124168396, + "learning_rate": 7.129189025007593e-05, + "loss": 1.5665, + "step": 5681 + }, + { + "epoch": 1.72469266960085, + "grad_norm": 0.5642903447151184, + "learning_rate": 7.128682798420573e-05, + "loss": 1.6998, + "step": 5682 + }, + { + "epoch": 1.7249962057975412, + "grad_norm": 0.4493815302848816, + "learning_rate": 7.128176571833554e-05, + "loss": 1.916, + "step": 5683 + }, + { + "epoch": 1.7252997419942329, + "grad_norm": 0.5203521251678467, + "learning_rate": 7.127670345246533e-05, + "loss": 1.4566, + "step": 5684 + }, + { + "epoch": 1.7256032781909243, + "grad_norm": 0.42749977111816406, + "learning_rate": 7.127164118659513e-05, + "loss": 1.138, + "step": 5685 + }, + { + "epoch": 1.7259068143876157, + "grad_norm": 0.535605788230896, + "learning_rate": 7.126657892072492e-05, + "loss": 1.449, + "step": 5686 + }, + { + "epoch": 1.7262103505843072, + "grad_norm": 0.5324421525001526, + "learning_rate": 7.126151665485472e-05, + "loss": 1.6838, + "step": 5687 + }, + { + "epoch": 1.7265138867809986, + "grad_norm": 0.48239511251449585, + "learning_rate": 7.125645438898451e-05, + "loss": 1.656, + "step": 5688 + }, + { + "epoch": 1.7268174229776903, + "grad_norm": 0.44394174218177795, + "learning_rate": 7.125139212311432e-05, + "loss": 1.2908, + "step": 5689 + }, + { + "epoch": 1.7271209591743815, + "grad_norm": 0.6110196709632874, + "learning_rate": 7.124632985724411e-05, + "loss": 1.0364, + "step": 5690 + }, + { + "epoch": 1.7274244953710731, + "grad_norm": 0.4668317139148712, + "learning_rate": 7.124126759137391e-05, + "loss": 2.1075, + "step": 5691 + }, + { + "epoch": 1.7277280315677643, + "grad_norm": 0.79306960105896, + "learning_rate": 7.12362053255037e-05, + "loss": 1.1159, + "step": 5692 + }, + { + "epoch": 1.728031567764456, + "grad_norm": 0.7084735631942749, + "learning_rate": 7.12311430596335e-05, + "loss": 1.9203, + "step": 5693 + }, + { + "epoch": 1.7283351039611472, + "grad_norm": 0.5400936007499695, + "learning_rate": 7.12260807937633e-05, + "loss": 1.6333, + "step": 5694 + }, + { + "epoch": 1.7286386401578389, + "grad_norm": 0.5740994215011597, + "learning_rate": 7.122101852789309e-05, + "loss": 1.7508, + "step": 5695 + }, + { + "epoch": 1.7289421763545303, + "grad_norm": 0.7395409345626831, + "learning_rate": 7.121595626202288e-05, + "loss": 1.5679, + "step": 5696 + }, + { + "epoch": 1.7292457125512217, + "grad_norm": 0.5085437893867493, + "learning_rate": 7.121089399615268e-05, + "loss": 1.6915, + "step": 5697 + }, + { + "epoch": 1.7295492487479132, + "grad_norm": 0.5388838648796082, + "learning_rate": 7.120583173028249e-05, + "loss": 1.3938, + "step": 5698 + }, + { + "epoch": 1.7298527849446046, + "grad_norm": 0.46243810653686523, + "learning_rate": 7.120076946441228e-05, + "loss": 1.3695, + "step": 5699 + }, + { + "epoch": 1.7301563211412962, + "grad_norm": 0.5669201612472534, + "learning_rate": 7.119570719854208e-05, + "loss": 1.7704, + "step": 5700 + }, + { + "epoch": 1.7304598573379875, + "grad_norm": 0.5119979977607727, + "learning_rate": 7.119064493267187e-05, + "loss": 1.6959, + "step": 5701 + }, + { + "epoch": 1.7307633935346791, + "grad_norm": 0.6088730096817017, + "learning_rate": 7.118558266680167e-05, + "loss": 1.6063, + "step": 5702 + }, + { + "epoch": 1.7310669297313703, + "grad_norm": 0.510960578918457, + "learning_rate": 7.118052040093146e-05, + "loss": 1.7251, + "step": 5703 + }, + { + "epoch": 1.731370465928062, + "grad_norm": 0.6074891090393066, + "learning_rate": 7.117545813506125e-05, + "loss": 1.8498, + "step": 5704 + }, + { + "epoch": 1.7316740021247534, + "grad_norm": 0.6876358389854431, + "learning_rate": 7.117039586919105e-05, + "loss": 1.3767, + "step": 5705 + }, + { + "epoch": 1.7319775383214449, + "grad_norm": 0.4814869165420532, + "learning_rate": 7.116533360332084e-05, + "loss": 2.0438, + "step": 5706 + }, + { + "epoch": 1.7322810745181363, + "grad_norm": 0.5223357081413269, + "learning_rate": 7.116027133745064e-05, + "loss": 1.434, + "step": 5707 + }, + { + "epoch": 1.7325846107148277, + "grad_norm": 0.4965895116329193, + "learning_rate": 7.115520907158045e-05, + "loss": 1.5875, + "step": 5708 + }, + { + "epoch": 1.7328881469115192, + "grad_norm": 0.5907561182975769, + "learning_rate": 7.115014680571024e-05, + "loss": 1.0798, + "step": 5709 + }, + { + "epoch": 1.7331916831082106, + "grad_norm": 0.5148065090179443, + "learning_rate": 7.114508453984004e-05, + "loss": 1.3992, + "step": 5710 + }, + { + "epoch": 1.7334952193049022, + "grad_norm": 0.5626041293144226, + "learning_rate": 7.114002227396983e-05, + "loss": 1.6685, + "step": 5711 + }, + { + "epoch": 1.7337987555015935, + "grad_norm": 0.5806806087493896, + "learning_rate": 7.113496000809963e-05, + "loss": 1.3504, + "step": 5712 + }, + { + "epoch": 1.7341022916982851, + "grad_norm": 0.5926372408866882, + "learning_rate": 7.112989774222942e-05, + "loss": 1.9902, + "step": 5713 + }, + { + "epoch": 1.7344058278949763, + "grad_norm": 0.5233840346336365, + "learning_rate": 7.112483547635922e-05, + "loss": 1.7358, + "step": 5714 + }, + { + "epoch": 1.734709364091668, + "grad_norm": 0.5744931101799011, + "learning_rate": 7.111977321048901e-05, + "loss": 1.7516, + "step": 5715 + }, + { + "epoch": 1.7350129002883594, + "grad_norm": 0.9743430018424988, + "learning_rate": 7.11147109446188e-05, + "loss": 1.3766, + "step": 5716 + }, + { + "epoch": 1.7353164364850509, + "grad_norm": 0.6813448667526245, + "learning_rate": 7.110964867874861e-05, + "loss": 1.5492, + "step": 5717 + }, + { + "epoch": 1.7356199726817423, + "grad_norm": 0.5910537838935852, + "learning_rate": 7.110458641287841e-05, + "loss": 1.5894, + "step": 5718 + }, + { + "epoch": 1.7359235088784337, + "grad_norm": 0.4935073256492615, + "learning_rate": 7.10995241470082e-05, + "loss": 1.3373, + "step": 5719 + }, + { + "epoch": 1.7362270450751254, + "grad_norm": 0.596523642539978, + "learning_rate": 7.1094461881138e-05, + "loss": 1.6602, + "step": 5720 + }, + { + "epoch": 1.7365305812718166, + "grad_norm": 0.5797901153564453, + "learning_rate": 7.10893996152678e-05, + "loss": 1.4616, + "step": 5721 + }, + { + "epoch": 1.7368341174685082, + "grad_norm": 0.557640016078949, + "learning_rate": 7.108433734939759e-05, + "loss": 1.5423, + "step": 5722 + }, + { + "epoch": 1.7371376536651995, + "grad_norm": 1.0686063766479492, + "learning_rate": 7.107927508352738e-05, + "loss": 1.7067, + "step": 5723 + }, + { + "epoch": 1.737441189861891, + "grad_norm": 0.6175087690353394, + "learning_rate": 7.107421281765718e-05, + "loss": 1.2277, + "step": 5724 + }, + { + "epoch": 1.7377447260585823, + "grad_norm": 0.5334296226501465, + "learning_rate": 7.106915055178697e-05, + "loss": 1.4583, + "step": 5725 + }, + { + "epoch": 1.738048262255274, + "grad_norm": 0.5804897546768188, + "learning_rate": 7.106408828591678e-05, + "loss": 1.8278, + "step": 5726 + }, + { + "epoch": 1.7383517984519654, + "grad_norm": 0.5825898051261902, + "learning_rate": 7.105902602004658e-05, + "loss": 1.3466, + "step": 5727 + }, + { + "epoch": 1.7386553346486568, + "grad_norm": 0.5392926335334778, + "learning_rate": 7.105396375417638e-05, + "loss": 1.6765, + "step": 5728 + }, + { + "epoch": 1.7389588708453483, + "grad_norm": 0.5440019965171814, + "learning_rate": 7.104890148830618e-05, + "loss": 1.5281, + "step": 5729 + }, + { + "epoch": 1.7392624070420397, + "grad_norm": 0.4941781461238861, + "learning_rate": 7.104383922243597e-05, + "loss": 1.2593, + "step": 5730 + }, + { + "epoch": 1.7395659432387314, + "grad_norm": 0.37533116340637207, + "learning_rate": 7.103877695656577e-05, + "loss": 1.8643, + "step": 5731 + }, + { + "epoch": 1.7398694794354226, + "grad_norm": 1.0915099382400513, + "learning_rate": 7.103371469069556e-05, + "loss": 1.2421, + "step": 5732 + }, + { + "epoch": 1.7401730156321142, + "grad_norm": 0.557999312877655, + "learning_rate": 7.102865242482536e-05, + "loss": 1.6621, + "step": 5733 + }, + { + "epoch": 1.7404765518288055, + "grad_norm": 0.4855442941188812, + "learning_rate": 7.102359015895515e-05, + "loss": 1.1559, + "step": 5734 + }, + { + "epoch": 1.740780088025497, + "grad_norm": 0.6112437844276428, + "learning_rate": 7.101852789308495e-05, + "loss": 1.8523, + "step": 5735 + }, + { + "epoch": 1.7410836242221885, + "grad_norm": 0.5963307023048401, + "learning_rate": 7.101346562721474e-05, + "loss": 1.3682, + "step": 5736 + }, + { + "epoch": 1.74138716041888, + "grad_norm": 0.5917012095451355, + "learning_rate": 7.100840336134455e-05, + "loss": 1.8957, + "step": 5737 + }, + { + "epoch": 1.7416906966155714, + "grad_norm": 0.5097271800041199, + "learning_rate": 7.100334109547435e-05, + "loss": 1.658, + "step": 5738 + }, + { + "epoch": 1.7419942328122628, + "grad_norm": 0.6804052591323853, + "learning_rate": 7.099827882960414e-05, + "loss": 0.9973, + "step": 5739 + }, + { + "epoch": 1.7422977690089543, + "grad_norm": 0.700357973575592, + "learning_rate": 7.099321656373394e-05, + "loss": 1.5626, + "step": 5740 + }, + { + "epoch": 1.7426013052056457, + "grad_norm": 0.7932068109512329, + "learning_rate": 7.098815429786373e-05, + "loss": 2.051, + "step": 5741 + }, + { + "epoch": 1.7429048414023374, + "grad_norm": 0.5094744563102722, + "learning_rate": 7.098309203199352e-05, + "loss": 1.4574, + "step": 5742 + }, + { + "epoch": 1.7432083775990286, + "grad_norm": 0.7267150282859802, + "learning_rate": 7.097802976612332e-05, + "loss": 1.262, + "step": 5743 + }, + { + "epoch": 1.7435119137957202, + "grad_norm": 0.6010065674781799, + "learning_rate": 7.097296750025311e-05, + "loss": 1.4556, + "step": 5744 + }, + { + "epoch": 1.7438154499924114, + "grad_norm": 0.6587292551994324, + "learning_rate": 7.096790523438291e-05, + "loss": 1.6217, + "step": 5745 + }, + { + "epoch": 1.744118986189103, + "grad_norm": 0.7218916416168213, + "learning_rate": 7.09628429685127e-05, + "loss": 1.8298, + "step": 5746 + }, + { + "epoch": 1.7444225223857945, + "grad_norm": 0.6167722344398499, + "learning_rate": 7.095778070264251e-05, + "loss": 1.5336, + "step": 5747 + }, + { + "epoch": 1.744726058582486, + "grad_norm": 0.5541715025901794, + "learning_rate": 7.095271843677231e-05, + "loss": 1.7352, + "step": 5748 + }, + { + "epoch": 1.7450295947791774, + "grad_norm": 0.5876067280769348, + "learning_rate": 7.09476561709021e-05, + "loss": 2.0744, + "step": 5749 + }, + { + "epoch": 1.7453331309758688, + "grad_norm": 0.5202496647834778, + "learning_rate": 7.09425939050319e-05, + "loss": 1.6327, + "step": 5750 + }, + { + "epoch": 1.7456366671725603, + "grad_norm": 0.5222525596618652, + "learning_rate": 7.093753163916169e-05, + "loss": 1.8234, + "step": 5751 + }, + { + "epoch": 1.7459402033692517, + "grad_norm": 0.6135051846504211, + "learning_rate": 7.093246937329149e-05, + "loss": 1.5676, + "step": 5752 + }, + { + "epoch": 1.7462437395659434, + "grad_norm": 0.6794110536575317, + "learning_rate": 7.092740710742128e-05, + "loss": 1.9735, + "step": 5753 + }, + { + "epoch": 1.7465472757626346, + "grad_norm": 0.561379611492157, + "learning_rate": 7.092234484155108e-05, + "loss": 1.4065, + "step": 5754 + }, + { + "epoch": 1.7468508119593262, + "grad_norm": 0.4962792992591858, + "learning_rate": 7.091728257568087e-05, + "loss": 1.3181, + "step": 5755 + }, + { + "epoch": 1.7471543481560174, + "grad_norm": 0.5877513289451599, + "learning_rate": 7.091222030981068e-05, + "loss": 1.7103, + "step": 5756 + }, + { + "epoch": 1.747457884352709, + "grad_norm": 0.5663416981697083, + "learning_rate": 7.090715804394047e-05, + "loss": 1.1936, + "step": 5757 + }, + { + "epoch": 1.7477614205494005, + "grad_norm": 0.5573774576187134, + "learning_rate": 7.090209577807027e-05, + "loss": 1.7181, + "step": 5758 + }, + { + "epoch": 1.748064956746092, + "grad_norm": 0.8682475686073303, + "learning_rate": 7.089703351220006e-05, + "loss": 1.8022, + "step": 5759 + }, + { + "epoch": 1.7483684929427834, + "grad_norm": 0.444024920463562, + "learning_rate": 7.089197124632986e-05, + "loss": 1.9491, + "step": 5760 + }, + { + "epoch": 1.7486720291394748, + "grad_norm": 0.5527752041816711, + "learning_rate": 7.088690898045965e-05, + "loss": 1.2612, + "step": 5761 + }, + { + "epoch": 1.7489755653361665, + "grad_norm": 0.514806866645813, + "learning_rate": 7.088184671458945e-05, + "loss": 2.1253, + "step": 5762 + }, + { + "epoch": 1.7492791015328577, + "grad_norm": 0.5180836319923401, + "learning_rate": 7.087678444871924e-05, + "loss": 1.4736, + "step": 5763 + }, + { + "epoch": 1.7495826377295494, + "grad_norm": 0.7507419586181641, + "learning_rate": 7.087172218284904e-05, + "loss": 1.1092, + "step": 5764 + }, + { + "epoch": 1.7498861739262406, + "grad_norm": 0.6647083163261414, + "learning_rate": 7.086665991697885e-05, + "loss": 1.6887, + "step": 5765 + }, + { + "epoch": 1.7501897101229322, + "grad_norm": 0.5800061225891113, + "learning_rate": 7.086159765110864e-05, + "loss": 1.7121, + "step": 5766 + }, + { + "epoch": 1.7504932463196237, + "grad_norm": 0.5175750255584717, + "learning_rate": 7.085653538523844e-05, + "loss": 1.373, + "step": 5767 + }, + { + "epoch": 1.750796782516315, + "grad_norm": 0.4834209978580475, + "learning_rate": 7.085147311936823e-05, + "loss": 1.2436, + "step": 5768 + }, + { + "epoch": 1.7511003187130065, + "grad_norm": 0.4929172396659851, + "learning_rate": 7.084641085349803e-05, + "loss": 1.1829, + "step": 5769 + }, + { + "epoch": 1.751403854909698, + "grad_norm": 0.5788953900337219, + "learning_rate": 7.084134858762782e-05, + "loss": 1.7369, + "step": 5770 + }, + { + "epoch": 1.7517073911063894, + "grad_norm": 0.65016108751297, + "learning_rate": 7.083628632175761e-05, + "loss": 1.3848, + "step": 5771 + }, + { + "epoch": 1.7520109273030808, + "grad_norm": 0.6161928772926331, + "learning_rate": 7.083122405588742e-05, + "loss": 1.5332, + "step": 5772 + }, + { + "epoch": 1.7523144634997725, + "grad_norm": 0.6327143907546997, + "learning_rate": 7.082616179001722e-05, + "loss": 1.742, + "step": 5773 + }, + { + "epoch": 1.7526179996964637, + "grad_norm": 0.5929992198944092, + "learning_rate": 7.082109952414701e-05, + "loss": 1.6308, + "step": 5774 + }, + { + "epoch": 1.7529215358931554, + "grad_norm": 0.5704889893531799, + "learning_rate": 7.081603725827681e-05, + "loss": 1.4623, + "step": 5775 + }, + { + "epoch": 1.7532250720898466, + "grad_norm": 0.47019699215888977, + "learning_rate": 7.081097499240662e-05, + "loss": 1.4923, + "step": 5776 + }, + { + "epoch": 1.7535286082865382, + "grad_norm": 0.5670586824417114, + "learning_rate": 7.080591272653641e-05, + "loss": 1.9408, + "step": 5777 + }, + { + "epoch": 1.7538321444832297, + "grad_norm": 0.6063165664672852, + "learning_rate": 7.08008504606662e-05, + "loss": 0.6937, + "step": 5778 + }, + { + "epoch": 1.754135680679921, + "grad_norm": 0.6302130222320557, + "learning_rate": 7.0795788194796e-05, + "loss": 1.7437, + "step": 5779 + }, + { + "epoch": 1.7544392168766125, + "grad_norm": 0.5551527738571167, + "learning_rate": 7.07907259289258e-05, + "loss": 1.8204, + "step": 5780 + }, + { + "epoch": 1.754742753073304, + "grad_norm": 0.5558610558509827, + "learning_rate": 7.078566366305559e-05, + "loss": 1.2973, + "step": 5781 + }, + { + "epoch": 1.7550462892699954, + "grad_norm": 0.6446595788002014, + "learning_rate": 7.078060139718538e-05, + "loss": 1.2467, + "step": 5782 + }, + { + "epoch": 1.7553498254666868, + "grad_norm": 0.5184798836708069, + "learning_rate": 7.077553913131518e-05, + "loss": 1.7002, + "step": 5783 + }, + { + "epoch": 1.7556533616633785, + "grad_norm": 0.7394595742225647, + "learning_rate": 7.077047686544497e-05, + "loss": 1.8713, + "step": 5784 + }, + { + "epoch": 1.7559568978600697, + "grad_norm": 0.5985365509986877, + "learning_rate": 7.076541459957477e-05, + "loss": 1.8934, + "step": 5785 + }, + { + "epoch": 1.7562604340567614, + "grad_norm": 0.5682666897773743, + "learning_rate": 7.076035233370458e-05, + "loss": 1.9157, + "step": 5786 + }, + { + "epoch": 1.7565639702534526, + "grad_norm": 0.6347185373306274, + "learning_rate": 7.075529006783437e-05, + "loss": 1.7947, + "step": 5787 + }, + { + "epoch": 1.7568675064501442, + "grad_norm": 0.7159053683280945, + "learning_rate": 7.075022780196417e-05, + "loss": 1.6663, + "step": 5788 + }, + { + "epoch": 1.7571710426468357, + "grad_norm": 0.5242490172386169, + "learning_rate": 7.074516553609396e-05, + "loss": 1.6238, + "step": 5789 + }, + { + "epoch": 1.757474578843527, + "grad_norm": 0.9489251971244812, + "learning_rate": 7.074010327022376e-05, + "loss": 1.729, + "step": 5790 + }, + { + "epoch": 1.7577781150402185, + "grad_norm": 0.5605430006980896, + "learning_rate": 7.073504100435355e-05, + "loss": 1.7129, + "step": 5791 + }, + { + "epoch": 1.75808165123691, + "grad_norm": 0.512266993522644, + "learning_rate": 7.072997873848335e-05, + "loss": 1.9685, + "step": 5792 + }, + { + "epoch": 1.7583851874336016, + "grad_norm": 0.5452455878257751, + "learning_rate": 7.072491647261314e-05, + "loss": 2.1181, + "step": 5793 + }, + { + "epoch": 1.7586887236302928, + "grad_norm": 0.9478041529655457, + "learning_rate": 7.071985420674294e-05, + "loss": 1.327, + "step": 5794 + }, + { + "epoch": 1.7589922598269845, + "grad_norm": 0.881410539150238, + "learning_rate": 7.071479194087274e-05, + "loss": 1.4846, + "step": 5795 + }, + { + "epoch": 1.7592957960236757, + "grad_norm": 0.5429690480232239, + "learning_rate": 7.070972967500254e-05, + "loss": 1.5724, + "step": 5796 + }, + { + "epoch": 1.7595993322203674, + "grad_norm": 0.48222050070762634, + "learning_rate": 7.070466740913233e-05, + "loss": 1.8012, + "step": 5797 + }, + { + "epoch": 1.7599028684170588, + "grad_norm": 0.5965589880943298, + "learning_rate": 7.069960514326213e-05, + "loss": 1.5959, + "step": 5798 + }, + { + "epoch": 1.7602064046137502, + "grad_norm": 1.3540230989456177, + "learning_rate": 7.069454287739192e-05, + "loss": 1.1057, + "step": 5799 + }, + { + "epoch": 1.7605099408104417, + "grad_norm": 0.6451812982559204, + "learning_rate": 7.068948061152172e-05, + "loss": 1.8921, + "step": 5800 + }, + { + "epoch": 1.760813477007133, + "grad_norm": 0.535523533821106, + "learning_rate": 7.068441834565151e-05, + "loss": 1.9626, + "step": 5801 + }, + { + "epoch": 1.7611170132038245, + "grad_norm": 0.6170904636383057, + "learning_rate": 7.067935607978131e-05, + "loss": 1.7806, + "step": 5802 + }, + { + "epoch": 1.761420549400516, + "grad_norm": 0.4982869327068329, + "learning_rate": 7.06742938139111e-05, + "loss": 1.9225, + "step": 5803 + }, + { + "epoch": 1.7617240855972076, + "grad_norm": 0.6037775874137878, + "learning_rate": 7.066923154804091e-05, + "loss": 1.7682, + "step": 5804 + }, + { + "epoch": 1.7620276217938988, + "grad_norm": 0.535892128944397, + "learning_rate": 7.06641692821707e-05, + "loss": 1.8156, + "step": 5805 + }, + { + "epoch": 1.7623311579905905, + "grad_norm": 0.5253079533576965, + "learning_rate": 7.06591070163005e-05, + "loss": 1.7642, + "step": 5806 + }, + { + "epoch": 1.7626346941872817, + "grad_norm": 0.5811476111412048, + "learning_rate": 7.06540447504303e-05, + "loss": 1.8639, + "step": 5807 + }, + { + "epoch": 1.7629382303839733, + "grad_norm": 0.5927221179008484, + "learning_rate": 7.064898248456009e-05, + "loss": 1.5973, + "step": 5808 + }, + { + "epoch": 1.7632417665806648, + "grad_norm": 0.6458206176757812, + "learning_rate": 7.064392021868988e-05, + "loss": 1.6549, + "step": 5809 + }, + { + "epoch": 1.7635453027773562, + "grad_norm": 0.4807237684726715, + "learning_rate": 7.063885795281968e-05, + "loss": 1.5492, + "step": 5810 + }, + { + "epoch": 1.7638488389740477, + "grad_norm": 0.4759950637817383, + "learning_rate": 7.063379568694947e-05, + "loss": 1.7667, + "step": 5811 + }, + { + "epoch": 1.764152375170739, + "grad_norm": 0.6172373294830322, + "learning_rate": 7.062873342107927e-05, + "loss": 1.4779, + "step": 5812 + }, + { + "epoch": 1.7644559113674305, + "grad_norm": 0.5831183791160583, + "learning_rate": 7.062367115520906e-05, + "loss": 1.6003, + "step": 5813 + }, + { + "epoch": 1.764759447564122, + "grad_norm": 0.4175955057144165, + "learning_rate": 7.061860888933887e-05, + "loss": 1.9885, + "step": 5814 + }, + { + "epoch": 1.7650629837608136, + "grad_norm": 0.5079640746116638, + "learning_rate": 7.061354662346867e-05, + "loss": 1.8727, + "step": 5815 + }, + { + "epoch": 1.7653665199575048, + "grad_norm": 0.5410943627357483, + "learning_rate": 7.060848435759846e-05, + "loss": 1.1928, + "step": 5816 + }, + { + "epoch": 1.7656700561541965, + "grad_norm": 0.504618227481842, + "learning_rate": 7.060342209172827e-05, + "loss": 2.2719, + "step": 5817 + }, + { + "epoch": 1.7659735923508877, + "grad_norm": 0.602778971195221, + "learning_rate": 7.059835982585806e-05, + "loss": 1.6486, + "step": 5818 + }, + { + "epoch": 1.7662771285475793, + "grad_norm": 0.5493549108505249, + "learning_rate": 7.059329755998786e-05, + "loss": 1.3315, + "step": 5819 + }, + { + "epoch": 1.7665806647442708, + "grad_norm": 0.5952966213226318, + "learning_rate": 7.058823529411765e-05, + "loss": 1.5493, + "step": 5820 + }, + { + "epoch": 1.7668842009409622, + "grad_norm": 0.616404116153717, + "learning_rate": 7.058317302824745e-05, + "loss": 1.1801, + "step": 5821 + }, + { + "epoch": 1.7671877371376536, + "grad_norm": 0.8588623404502869, + "learning_rate": 7.057811076237724e-05, + "loss": 1.5727, + "step": 5822 + }, + { + "epoch": 1.767491273334345, + "grad_norm": 0.40923357009887695, + "learning_rate": 7.057304849650704e-05, + "loss": 1.3718, + "step": 5823 + }, + { + "epoch": 1.7677948095310367, + "grad_norm": 0.5931923389434814, + "learning_rate": 7.056798623063683e-05, + "loss": 1.5876, + "step": 5824 + }, + { + "epoch": 1.768098345727728, + "grad_norm": 0.51994788646698, + "learning_rate": 7.056292396476664e-05, + "loss": 1.8799, + "step": 5825 + }, + { + "epoch": 1.7684018819244196, + "grad_norm": 0.5930038690567017, + "learning_rate": 7.055786169889644e-05, + "loss": 1.8165, + "step": 5826 + }, + { + "epoch": 1.7687054181211108, + "grad_norm": 0.5225912928581238, + "learning_rate": 7.055279943302623e-05, + "loss": 1.3906, + "step": 5827 + }, + { + "epoch": 1.7690089543178025, + "grad_norm": 0.45339706540107727, + "learning_rate": 7.054773716715603e-05, + "loss": 2.0455, + "step": 5828 + }, + { + "epoch": 1.769312490514494, + "grad_norm": 1.311185359954834, + "learning_rate": 7.054267490128582e-05, + "loss": 1.5411, + "step": 5829 + }, + { + "epoch": 1.7696160267111853, + "grad_norm": 0.5847686529159546, + "learning_rate": 7.053761263541562e-05, + "loss": 1.4527, + "step": 5830 + }, + { + "epoch": 1.7699195629078768, + "grad_norm": 0.6320043802261353, + "learning_rate": 7.053255036954541e-05, + "loss": 1.2047, + "step": 5831 + }, + { + "epoch": 1.7702230991045682, + "grad_norm": 0.7766631841659546, + "learning_rate": 7.05274881036752e-05, + "loss": 1.7178, + "step": 5832 + }, + { + "epoch": 1.7705266353012596, + "grad_norm": 0.49876946210861206, + "learning_rate": 7.0522425837805e-05, + "loss": 1.7734, + "step": 5833 + }, + { + "epoch": 1.770830171497951, + "grad_norm": 0.6119621396064758, + "learning_rate": 7.051736357193481e-05, + "loss": 1.7981, + "step": 5834 + }, + { + "epoch": 1.7711337076946427, + "grad_norm": 0.5587700605392456, + "learning_rate": 7.05123013060646e-05, + "loss": 1.9976, + "step": 5835 + }, + { + "epoch": 1.771437243891334, + "grad_norm": 0.6698164939880371, + "learning_rate": 7.05072390401944e-05, + "loss": 1.563, + "step": 5836 + }, + { + "epoch": 1.7717407800880256, + "grad_norm": 0.5238813161849976, + "learning_rate": 7.050217677432419e-05, + "loss": 1.7543, + "step": 5837 + }, + { + "epoch": 1.7720443162847168, + "grad_norm": 0.7682486176490784, + "learning_rate": 7.049711450845399e-05, + "loss": 1.6731, + "step": 5838 + }, + { + "epoch": 1.7723478524814085, + "grad_norm": 0.4635404944419861, + "learning_rate": 7.049205224258378e-05, + "loss": 1.321, + "step": 5839 + }, + { + "epoch": 1.7726513886781, + "grad_norm": 0.579387903213501, + "learning_rate": 7.048698997671358e-05, + "loss": 1.6947, + "step": 5840 + }, + { + "epoch": 1.7729549248747913, + "grad_norm": 0.5389410257339478, + "learning_rate": 7.048192771084337e-05, + "loss": 1.5536, + "step": 5841 + }, + { + "epoch": 1.7732584610714828, + "grad_norm": 0.6099681258201599, + "learning_rate": 7.047686544497317e-05, + "loss": 1.6241, + "step": 5842 + }, + { + "epoch": 1.7735619972681742, + "grad_norm": 0.5654680728912354, + "learning_rate": 7.047180317910298e-05, + "loss": 1.6327, + "step": 5843 + }, + { + "epoch": 1.7738655334648656, + "grad_norm": 0.643012285232544, + "learning_rate": 7.046674091323277e-05, + "loss": 1.6656, + "step": 5844 + }, + { + "epoch": 1.774169069661557, + "grad_norm": 0.5325130224227905, + "learning_rate": 7.046167864736256e-05, + "loss": 1.5048, + "step": 5845 + }, + { + "epoch": 1.7744726058582487, + "grad_norm": 0.42735743522644043, + "learning_rate": 7.045661638149236e-05, + "loss": 1.0482, + "step": 5846 + }, + { + "epoch": 1.77477614205494, + "grad_norm": 0.5796374678611755, + "learning_rate": 7.045155411562215e-05, + "loss": 1.5523, + "step": 5847 + }, + { + "epoch": 1.7750796782516316, + "grad_norm": 0.5250108242034912, + "learning_rate": 7.044649184975195e-05, + "loss": 1.8127, + "step": 5848 + }, + { + "epoch": 1.7753832144483228, + "grad_norm": 0.9125270247459412, + "learning_rate": 7.044142958388174e-05, + "loss": 1.751, + "step": 5849 + }, + { + "epoch": 1.7756867506450145, + "grad_norm": 0.6005438566207886, + "learning_rate": 7.043636731801154e-05, + "loss": 1.609, + "step": 5850 + }, + { + "epoch": 1.775990286841706, + "grad_norm": 0.5507426261901855, + "learning_rate": 7.043130505214133e-05, + "loss": 1.7003, + "step": 5851 + }, + { + "epoch": 1.7762938230383973, + "grad_norm": 0.6349962949752808, + "learning_rate": 7.042624278627113e-05, + "loss": 1.5438, + "step": 5852 + }, + { + "epoch": 1.7765973592350888, + "grad_norm": 1.0367538928985596, + "learning_rate": 7.042118052040094e-05, + "loss": 1.5797, + "step": 5853 + }, + { + "epoch": 1.7769008954317802, + "grad_norm": 0.6401512026786804, + "learning_rate": 7.041611825453073e-05, + "loss": 1.5123, + "step": 5854 + }, + { + "epoch": 1.7772044316284719, + "grad_norm": 0.5885390043258667, + "learning_rate": 7.041105598866053e-05, + "loss": 1.0058, + "step": 5855 + }, + { + "epoch": 1.777507967825163, + "grad_norm": 0.4374138414859772, + "learning_rate": 7.040599372279032e-05, + "loss": 1.7388, + "step": 5856 + }, + { + "epoch": 1.7778115040218547, + "grad_norm": 0.5692790150642395, + "learning_rate": 7.040093145692012e-05, + "loss": 1.3957, + "step": 5857 + }, + { + "epoch": 1.778115040218546, + "grad_norm": 0.6237412691116333, + "learning_rate": 7.039586919104991e-05, + "loss": 1.6133, + "step": 5858 + }, + { + "epoch": 1.7784185764152376, + "grad_norm": 0.5770452618598938, + "learning_rate": 7.03908069251797e-05, + "loss": 1.6974, + "step": 5859 + }, + { + "epoch": 1.7787221126119288, + "grad_norm": 0.5058612823486328, + "learning_rate": 7.03857446593095e-05, + "loss": 1.9792, + "step": 5860 + }, + { + "epoch": 1.7790256488086205, + "grad_norm": 0.545733630657196, + "learning_rate": 7.038068239343931e-05, + "loss": 1.5064, + "step": 5861 + }, + { + "epoch": 1.779329185005312, + "grad_norm": 0.6026157140731812, + "learning_rate": 7.03756201275691e-05, + "loss": 2.1842, + "step": 5862 + }, + { + "epoch": 1.7796327212020033, + "grad_norm": 0.5493044257164001, + "learning_rate": 7.03705578616989e-05, + "loss": 1.8828, + "step": 5863 + }, + { + "epoch": 1.7799362573986948, + "grad_norm": 0.666919469833374, + "learning_rate": 7.03654955958287e-05, + "loss": 1.1003, + "step": 5864 + }, + { + "epoch": 1.7802397935953862, + "grad_norm": 0.529765248298645, + "learning_rate": 7.03604333299585e-05, + "loss": 2.1024, + "step": 5865 + }, + { + "epoch": 1.7805433297920779, + "grad_norm": 0.7152818441390991, + "learning_rate": 7.03553710640883e-05, + "loss": 1.2247, + "step": 5866 + }, + { + "epoch": 1.780846865988769, + "grad_norm": 0.5502989888191223, + "learning_rate": 7.035030879821809e-05, + "loss": 1.592, + "step": 5867 + }, + { + "epoch": 1.7811504021854607, + "grad_norm": 0.47319549322128296, + "learning_rate": 7.034524653234789e-05, + "loss": 1.7258, + "step": 5868 + }, + { + "epoch": 1.781453938382152, + "grad_norm": 0.5717188119888306, + "learning_rate": 7.034018426647768e-05, + "loss": 1.4437, + "step": 5869 + }, + { + "epoch": 1.7817574745788436, + "grad_norm": 0.5776780843734741, + "learning_rate": 7.033512200060748e-05, + "loss": 1.7443, + "step": 5870 + }, + { + "epoch": 1.782061010775535, + "grad_norm": 0.47379469871520996, + "learning_rate": 7.033005973473727e-05, + "loss": 1.5137, + "step": 5871 + }, + { + "epoch": 1.7823645469722265, + "grad_norm": 0.49418696761131287, + "learning_rate": 7.032499746886707e-05, + "loss": 1.591, + "step": 5872 + }, + { + "epoch": 1.782668083168918, + "grad_norm": 0.5168330669403076, + "learning_rate": 7.031993520299687e-05, + "loss": 1.9866, + "step": 5873 + }, + { + "epoch": 1.7829716193656093, + "grad_norm": 0.5431966781616211, + "learning_rate": 7.031487293712667e-05, + "loss": 1.6641, + "step": 5874 + }, + { + "epoch": 1.7832751555623008, + "grad_norm": 0.5647885799407959, + "learning_rate": 7.030981067125646e-05, + "loss": 1.2665, + "step": 5875 + }, + { + "epoch": 1.7835786917589922, + "grad_norm": 0.5556349754333496, + "learning_rate": 7.030474840538626e-05, + "loss": 1.9022, + "step": 5876 + }, + { + "epoch": 1.7838822279556839, + "grad_norm": 0.6246395707130432, + "learning_rate": 7.029968613951605e-05, + "loss": 1.1401, + "step": 5877 + }, + { + "epoch": 1.784185764152375, + "grad_norm": 0.47130244970321655, + "learning_rate": 7.029462387364585e-05, + "loss": 1.8312, + "step": 5878 + }, + { + "epoch": 1.7844893003490667, + "grad_norm": 0.49159568548202515, + "learning_rate": 7.028956160777564e-05, + "loss": 1.2571, + "step": 5879 + }, + { + "epoch": 1.784792836545758, + "grad_norm": 0.9010233879089355, + "learning_rate": 7.028449934190544e-05, + "loss": 1.2944, + "step": 5880 + }, + { + "epoch": 1.7850963727424496, + "grad_norm": 1.0043572187423706, + "learning_rate": 7.027943707603523e-05, + "loss": 1.2684, + "step": 5881 + }, + { + "epoch": 1.785399908939141, + "grad_norm": 0.5217756032943726, + "learning_rate": 7.027437481016504e-05, + "loss": 1.7476, + "step": 5882 + }, + { + "epoch": 1.7857034451358325, + "grad_norm": 0.6179821491241455, + "learning_rate": 7.026931254429483e-05, + "loss": 1.4536, + "step": 5883 + }, + { + "epoch": 1.786006981332524, + "grad_norm": 0.5888343453407288, + "learning_rate": 7.026425027842463e-05, + "loss": 1.3274, + "step": 5884 + }, + { + "epoch": 1.7863105175292153, + "grad_norm": 0.5379050970077515, + "learning_rate": 7.025918801255442e-05, + "loss": 1.6149, + "step": 5885 + }, + { + "epoch": 1.7866140537259068, + "grad_norm": 0.7219739556312561, + "learning_rate": 7.025412574668422e-05, + "loss": 1.459, + "step": 5886 + }, + { + "epoch": 1.7869175899225982, + "grad_norm": 0.4842709004878998, + "learning_rate": 7.024906348081401e-05, + "loss": 1.684, + "step": 5887 + }, + { + "epoch": 1.7872211261192898, + "grad_norm": 0.575980544090271, + "learning_rate": 7.024400121494381e-05, + "loss": 1.7599, + "step": 5888 + }, + { + "epoch": 1.787524662315981, + "grad_norm": 0.483018696308136, + "learning_rate": 7.02389389490736e-05, + "loss": 1.355, + "step": 5889 + }, + { + "epoch": 1.7878281985126727, + "grad_norm": 0.7090207934379578, + "learning_rate": 7.02338766832034e-05, + "loss": 1.8047, + "step": 5890 + }, + { + "epoch": 1.788131734709364, + "grad_norm": 0.5750283002853394, + "learning_rate": 7.02288144173332e-05, + "loss": 1.4583, + "step": 5891 + }, + { + "epoch": 1.7884352709060556, + "grad_norm": 0.5741564631462097, + "learning_rate": 7.0223752151463e-05, + "loss": 1.875, + "step": 5892 + }, + { + "epoch": 1.788738807102747, + "grad_norm": 0.5058529376983643, + "learning_rate": 7.02186898855928e-05, + "loss": 1.2128, + "step": 5893 + }, + { + "epoch": 1.7890423432994385, + "grad_norm": 0.6110987067222595, + "learning_rate": 7.021362761972259e-05, + "loss": 1.9625, + "step": 5894 + }, + { + "epoch": 1.7893458794961299, + "grad_norm": 0.594071626663208, + "learning_rate": 7.020856535385239e-05, + "loss": 1.862, + "step": 5895 + }, + { + "epoch": 1.7896494156928213, + "grad_norm": 0.818956732749939, + "learning_rate": 7.020350308798218e-05, + "loss": 1.6269, + "step": 5896 + }, + { + "epoch": 1.789952951889513, + "grad_norm": 0.5960368514060974, + "learning_rate": 7.019844082211198e-05, + "loss": 1.7182, + "step": 5897 + }, + { + "epoch": 1.7902564880862042, + "grad_norm": 0.5818991661071777, + "learning_rate": 7.019337855624177e-05, + "loss": 1.9811, + "step": 5898 + }, + { + "epoch": 1.7905600242828958, + "grad_norm": 0.4691188931465149, + "learning_rate": 7.018831629037157e-05, + "loss": 1.7591, + "step": 5899 + }, + { + "epoch": 1.790863560479587, + "grad_norm": 0.4774368703365326, + "learning_rate": 7.018325402450136e-05, + "loss": 1.6917, + "step": 5900 + }, + { + "epoch": 1.7911670966762787, + "grad_norm": 0.5307349562644958, + "learning_rate": 7.017819175863117e-05, + "loss": 1.743, + "step": 5901 + }, + { + "epoch": 1.7914706328729701, + "grad_norm": 0.572593092918396, + "learning_rate": 7.017312949276096e-05, + "loss": 1.5527, + "step": 5902 + }, + { + "epoch": 1.7917741690696616, + "grad_norm": 0.8130362033843994, + "learning_rate": 7.016806722689076e-05, + "loss": 1.9604, + "step": 5903 + }, + { + "epoch": 1.792077705266353, + "grad_norm": 0.637360155582428, + "learning_rate": 7.016300496102055e-05, + "loss": 1.3893, + "step": 5904 + }, + { + "epoch": 1.7923812414630445, + "grad_norm": 0.5618795156478882, + "learning_rate": 7.015794269515035e-05, + "loss": 1.8735, + "step": 5905 + }, + { + "epoch": 1.7926847776597359, + "grad_norm": 0.4983277916908264, + "learning_rate": 7.015288042928016e-05, + "loss": 1.6255, + "step": 5906 + }, + { + "epoch": 1.7929883138564273, + "grad_norm": 0.6057456731796265, + "learning_rate": 7.014781816340995e-05, + "loss": 1.6916, + "step": 5907 + }, + { + "epoch": 1.793291850053119, + "grad_norm": 0.5264350771903992, + "learning_rate": 7.014275589753975e-05, + "loss": 1.5999, + "step": 5908 + }, + { + "epoch": 1.7935953862498102, + "grad_norm": 1.0011577606201172, + "learning_rate": 7.013769363166954e-05, + "loss": 1.3513, + "step": 5909 + }, + { + "epoch": 1.7938989224465018, + "grad_norm": 0.6390252709388733, + "learning_rate": 7.013263136579934e-05, + "loss": 1.4275, + "step": 5910 + }, + { + "epoch": 1.794202458643193, + "grad_norm": 0.5232053995132446, + "learning_rate": 7.012756909992913e-05, + "loss": 1.6905, + "step": 5911 + }, + { + "epoch": 1.7945059948398847, + "grad_norm": 0.6496378183364868, + "learning_rate": 7.012250683405894e-05, + "loss": 1.4173, + "step": 5912 + }, + { + "epoch": 1.7948095310365761, + "grad_norm": 0.825714111328125, + "learning_rate": 7.011744456818873e-05, + "loss": 1.9885, + "step": 5913 + }, + { + "epoch": 1.7951130672332676, + "grad_norm": 0.6410032510757446, + "learning_rate": 7.011238230231853e-05, + "loss": 1.7687, + "step": 5914 + }, + { + "epoch": 1.795416603429959, + "grad_norm": 0.7020335793495178, + "learning_rate": 7.010732003644832e-05, + "loss": 2.0146, + "step": 5915 + }, + { + "epoch": 1.7957201396266504, + "grad_norm": 1.3581609725952148, + "learning_rate": 7.010225777057812e-05, + "loss": 1.1118, + "step": 5916 + }, + { + "epoch": 1.7960236758233419, + "grad_norm": 0.6407718658447266, + "learning_rate": 7.009719550470791e-05, + "loss": 1.0059, + "step": 5917 + }, + { + "epoch": 1.7963272120200333, + "grad_norm": 0.5095187425613403, + "learning_rate": 7.009213323883771e-05, + "loss": 1.7162, + "step": 5918 + }, + { + "epoch": 1.796630748216725, + "grad_norm": 0.5506390333175659, + "learning_rate": 7.00870709729675e-05, + "loss": 1.8015, + "step": 5919 + }, + { + "epoch": 1.7969342844134162, + "grad_norm": 0.4706830084323883, + "learning_rate": 7.00820087070973e-05, + "loss": 1.8148, + "step": 5920 + }, + { + "epoch": 1.7972378206101078, + "grad_norm": 0.4986935555934906, + "learning_rate": 7.00769464412271e-05, + "loss": 1.1621, + "step": 5921 + }, + { + "epoch": 1.797541356806799, + "grad_norm": 0.5431498885154724, + "learning_rate": 7.00718841753569e-05, + "loss": 1.8391, + "step": 5922 + }, + { + "epoch": 1.7978448930034907, + "grad_norm": 0.7347373962402344, + "learning_rate": 7.00668219094867e-05, + "loss": 1.2171, + "step": 5923 + }, + { + "epoch": 1.7981484292001821, + "grad_norm": 0.5176038146018982, + "learning_rate": 7.006175964361649e-05, + "loss": 1.299, + "step": 5924 + }, + { + "epoch": 1.7984519653968736, + "grad_norm": 0.5122864246368408, + "learning_rate": 7.005669737774628e-05, + "loss": 1.7784, + "step": 5925 + }, + { + "epoch": 1.798755501593565, + "grad_norm": 0.5709760785102844, + "learning_rate": 7.005163511187608e-05, + "loss": 1.0928, + "step": 5926 + }, + { + "epoch": 1.7990590377902564, + "grad_norm": 0.5653720498085022, + "learning_rate": 7.004657284600587e-05, + "loss": 1.7408, + "step": 5927 + }, + { + "epoch": 1.799362573986948, + "grad_norm": 0.610870897769928, + "learning_rate": 7.004151058013567e-05, + "loss": 1.9208, + "step": 5928 + }, + { + "epoch": 1.7996661101836393, + "grad_norm": 0.5883700847625732, + "learning_rate": 7.003644831426546e-05, + "loss": 1.7159, + "step": 5929 + }, + { + "epoch": 1.799969646380331, + "grad_norm": 0.5038648247718811, + "learning_rate": 7.003138604839526e-05, + "loss": 1.6974, + "step": 5930 + }, + { + "epoch": 1.8002731825770222, + "grad_norm": 0.4701935052871704, + "learning_rate": 7.002632378252507e-05, + "loss": 1.6792, + "step": 5931 + }, + { + "epoch": 1.8005767187737138, + "grad_norm": 0.5894878506660461, + "learning_rate": 7.002126151665486e-05, + "loss": 1.8782, + "step": 5932 + }, + { + "epoch": 1.8008802549704053, + "grad_norm": 0.5552791953086853, + "learning_rate": 7.001619925078466e-05, + "loss": 1.7514, + "step": 5933 + }, + { + "epoch": 1.8011837911670967, + "grad_norm": 0.5044195652008057, + "learning_rate": 7.001113698491445e-05, + "loss": 1.6754, + "step": 5934 + }, + { + "epoch": 1.8014873273637881, + "grad_norm": 0.5654546618461609, + "learning_rate": 7.000607471904425e-05, + "loss": 1.6864, + "step": 5935 + }, + { + "epoch": 1.8017908635604796, + "grad_norm": 0.5445656776428223, + "learning_rate": 7.000101245317404e-05, + "loss": 1.498, + "step": 5936 + }, + { + "epoch": 1.802094399757171, + "grad_norm": 0.5514572262763977, + "learning_rate": 6.999595018730384e-05, + "loss": 1.327, + "step": 5937 + }, + { + "epoch": 1.8023979359538624, + "grad_norm": 0.546735942363739, + "learning_rate": 6.999088792143363e-05, + "loss": 1.5668, + "step": 5938 + }, + { + "epoch": 1.802701472150554, + "grad_norm": 0.516412615776062, + "learning_rate": 6.998582565556342e-05, + "loss": 1.8425, + "step": 5939 + }, + { + "epoch": 1.8030050083472453, + "grad_norm": 0.5725192427635193, + "learning_rate": 6.998076338969323e-05, + "loss": 1.8569, + "step": 5940 + }, + { + "epoch": 1.803308544543937, + "grad_norm": 0.5454580783843994, + "learning_rate": 6.997570112382303e-05, + "loss": 1.9791, + "step": 5941 + }, + { + "epoch": 1.8036120807406282, + "grad_norm": 0.4446016848087311, + "learning_rate": 6.997063885795282e-05, + "loss": 1.5544, + "step": 5942 + }, + { + "epoch": 1.8039156169373198, + "grad_norm": 0.5859190821647644, + "learning_rate": 6.996557659208262e-05, + "loss": 1.5727, + "step": 5943 + }, + { + "epoch": 1.8042191531340113, + "grad_norm": 0.614378809928894, + "learning_rate": 6.996051432621241e-05, + "loss": 1.7846, + "step": 5944 + }, + { + "epoch": 1.8045226893307027, + "grad_norm": 0.6583629846572876, + "learning_rate": 6.995545206034221e-05, + "loss": 1.7894, + "step": 5945 + }, + { + "epoch": 1.8048262255273941, + "grad_norm": 0.6289352774620056, + "learning_rate": 6.9950389794472e-05, + "loss": 2.2178, + "step": 5946 + }, + { + "epoch": 1.8051297617240856, + "grad_norm": 0.5429802536964417, + "learning_rate": 6.99453275286018e-05, + "loss": 1.5751, + "step": 5947 + }, + { + "epoch": 1.805433297920777, + "grad_norm": 0.4763343036174774, + "learning_rate": 6.994026526273159e-05, + "loss": 1.7212, + "step": 5948 + }, + { + "epoch": 1.8057368341174684, + "grad_norm": 0.5531217455863953, + "learning_rate": 6.99352029968614e-05, + "loss": 1.4387, + "step": 5949 + }, + { + "epoch": 1.80604037031416, + "grad_norm": 0.4886355698108673, + "learning_rate": 6.99301407309912e-05, + "loss": 1.8291, + "step": 5950 + }, + { + "epoch": 1.8063439065108513, + "grad_norm": 0.4299159646034241, + "learning_rate": 6.9925078465121e-05, + "loss": 1.406, + "step": 5951 + }, + { + "epoch": 1.806647442707543, + "grad_norm": 0.5187563896179199, + "learning_rate": 6.99200161992508e-05, + "loss": 1.7927, + "step": 5952 + }, + { + "epoch": 1.8069509789042342, + "grad_norm": 0.5152180790901184, + "learning_rate": 6.991495393338059e-05, + "loss": 1.6718, + "step": 5953 + }, + { + "epoch": 1.8072545151009258, + "grad_norm": 0.5023542046546936, + "learning_rate": 6.990989166751039e-05, + "loss": 1.6979, + "step": 5954 + }, + { + "epoch": 1.8075580512976173, + "grad_norm": 0.5216313600540161, + "learning_rate": 6.990482940164018e-05, + "loss": 1.5704, + "step": 5955 + }, + { + "epoch": 1.8078615874943087, + "grad_norm": 0.5471825003623962, + "learning_rate": 6.989976713576998e-05, + "loss": 1.835, + "step": 5956 + }, + { + "epoch": 1.8081651236910001, + "grad_norm": 0.5783551931381226, + "learning_rate": 6.989470486989977e-05, + "loss": 1.7769, + "step": 5957 + }, + { + "epoch": 1.8084686598876916, + "grad_norm": 0.46548882126808167, + "learning_rate": 6.988964260402957e-05, + "loss": 1.7211, + "step": 5958 + }, + { + "epoch": 1.8087721960843832, + "grad_norm": 0.6428399085998535, + "learning_rate": 6.988458033815936e-05, + "loss": 1.7598, + "step": 5959 + }, + { + "epoch": 1.8090757322810744, + "grad_norm": 0.5859332084655762, + "learning_rate": 6.987951807228917e-05, + "loss": 1.6466, + "step": 5960 + }, + { + "epoch": 1.809379268477766, + "grad_norm": 0.5805972218513489, + "learning_rate": 6.987445580641896e-05, + "loss": 1.3684, + "step": 5961 + }, + { + "epoch": 1.8096828046744573, + "grad_norm": 0.49208566546440125, + "learning_rate": 6.986939354054876e-05, + "loss": 1.6773, + "step": 5962 + }, + { + "epoch": 1.809986340871149, + "grad_norm": 0.5302795171737671, + "learning_rate": 6.986433127467855e-05, + "loss": 1.7629, + "step": 5963 + }, + { + "epoch": 1.8102898770678404, + "grad_norm": 0.48644882440567017, + "learning_rate": 6.985926900880835e-05, + "loss": 1.8528, + "step": 5964 + }, + { + "epoch": 1.8105934132645318, + "grad_norm": 0.6011199355125427, + "learning_rate": 6.985420674293814e-05, + "loss": 1.8147, + "step": 5965 + }, + { + "epoch": 1.8108969494612233, + "grad_norm": 0.45972317457199097, + "learning_rate": 6.984914447706794e-05, + "loss": 1.6178, + "step": 5966 + }, + { + "epoch": 1.8112004856579147, + "grad_norm": 0.5446584224700928, + "learning_rate": 6.984408221119773e-05, + "loss": 1.5146, + "step": 5967 + }, + { + "epoch": 1.8115040218546061, + "grad_norm": 0.5274298787117004, + "learning_rate": 6.983901994532753e-05, + "loss": 1.464, + "step": 5968 + }, + { + "epoch": 1.8118075580512976, + "grad_norm": 0.5610344409942627, + "learning_rate": 6.983395767945732e-05, + "loss": 2.077, + "step": 5969 + }, + { + "epoch": 1.8121110942479892, + "grad_norm": 0.48236846923828125, + "learning_rate": 6.982889541358713e-05, + "loss": 1.5399, + "step": 5970 + }, + { + "epoch": 1.8124146304446804, + "grad_norm": 0.7982462644577026, + "learning_rate": 6.982383314771693e-05, + "loss": 1.6216, + "step": 5971 + }, + { + "epoch": 1.812718166641372, + "grad_norm": 0.48303601145744324, + "learning_rate": 6.981877088184672e-05, + "loss": 1.945, + "step": 5972 + }, + { + "epoch": 1.8130217028380633, + "grad_norm": 0.45602136850357056, + "learning_rate": 6.981370861597652e-05, + "loss": 1.8444, + "step": 5973 + }, + { + "epoch": 1.813325239034755, + "grad_norm": 0.5214483737945557, + "learning_rate": 6.980864635010631e-05, + "loss": 1.8304, + "step": 5974 + }, + { + "epoch": 1.8136287752314464, + "grad_norm": 0.5384705662727356, + "learning_rate": 6.98035840842361e-05, + "loss": 1.7733, + "step": 5975 + }, + { + "epoch": 1.8139323114281378, + "grad_norm": 0.6643903255462646, + "learning_rate": 6.97985218183659e-05, + "loss": 1.0964, + "step": 5976 + }, + { + "epoch": 1.8142358476248293, + "grad_norm": 0.5094102621078491, + "learning_rate": 6.97934595524957e-05, + "loss": 1.9589, + "step": 5977 + }, + { + "epoch": 1.8145393838215207, + "grad_norm": 0.5107044577598572, + "learning_rate": 6.978839728662549e-05, + "loss": 1.9909, + "step": 5978 + }, + { + "epoch": 1.8148429200182121, + "grad_norm": 0.577189028263092, + "learning_rate": 6.97833350207553e-05, + "loss": 0.9188, + "step": 5979 + }, + { + "epoch": 1.8151464562149036, + "grad_norm": 0.5533971786499023, + "learning_rate": 6.977827275488509e-05, + "loss": 1.5661, + "step": 5980 + }, + { + "epoch": 1.8154499924115952, + "grad_norm": 0.5704061985015869, + "learning_rate": 6.977321048901489e-05, + "loss": 1.415, + "step": 5981 + }, + { + "epoch": 1.8157535286082864, + "grad_norm": 0.5985412001609802, + "learning_rate": 6.976814822314468e-05, + "loss": 1.7265, + "step": 5982 + }, + { + "epoch": 1.816057064804978, + "grad_norm": 0.48246708512306213, + "learning_rate": 6.976308595727448e-05, + "loss": 1.3756, + "step": 5983 + }, + { + "epoch": 1.8163606010016693, + "grad_norm": 0.5040408372879028, + "learning_rate": 6.975802369140427e-05, + "loss": 1.8844, + "step": 5984 + }, + { + "epoch": 1.816664137198361, + "grad_norm": 0.5499937534332275, + "learning_rate": 6.975296142553407e-05, + "loss": 1.3734, + "step": 5985 + }, + { + "epoch": 1.8169676733950524, + "grad_norm": 0.6828827857971191, + "learning_rate": 6.974789915966386e-05, + "loss": 1.3821, + "step": 5986 + }, + { + "epoch": 1.8172712095917438, + "grad_norm": 0.5618600845336914, + "learning_rate": 6.974283689379366e-05, + "loss": 1.5585, + "step": 5987 + }, + { + "epoch": 1.8175747457884353, + "grad_norm": 0.6279012560844421, + "learning_rate": 6.973777462792346e-05, + "loss": 1.4342, + "step": 5988 + }, + { + "epoch": 1.8178782819851267, + "grad_norm": 0.5523353815078735, + "learning_rate": 6.973271236205326e-05, + "loss": 1.8851, + "step": 5989 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.5792304873466492, + "learning_rate": 6.972765009618305e-05, + "loss": 1.878, + "step": 5990 + }, + { + "epoch": 1.8184853543785096, + "grad_norm": 0.4983103275299072, + "learning_rate": 6.972258783031285e-05, + "loss": 1.7997, + "step": 5991 + }, + { + "epoch": 1.8187888905752012, + "grad_norm": 0.5694151520729065, + "learning_rate": 6.971752556444264e-05, + "loss": 1.7064, + "step": 5992 + }, + { + "epoch": 1.8190924267718924, + "grad_norm": 0.5514373779296875, + "learning_rate": 6.971246329857244e-05, + "loss": 1.6783, + "step": 5993 + }, + { + "epoch": 1.819395962968584, + "grad_norm": 0.5489677786827087, + "learning_rate": 6.970740103270223e-05, + "loss": 1.7817, + "step": 5994 + }, + { + "epoch": 1.8196994991652755, + "grad_norm": 0.6167088150978088, + "learning_rate": 6.970233876683204e-05, + "loss": 1.577, + "step": 5995 + }, + { + "epoch": 1.820003035361967, + "grad_norm": 0.6246216297149658, + "learning_rate": 6.969727650096184e-05, + "loss": 1.7529, + "step": 5996 + }, + { + "epoch": 1.8203065715586584, + "grad_norm": 0.5288349986076355, + "learning_rate": 6.969221423509163e-05, + "loss": 1.4161, + "step": 5997 + }, + { + "epoch": 1.8206101077553498, + "grad_norm": 0.5577011704444885, + "learning_rate": 6.968715196922143e-05, + "loss": 1.3679, + "step": 5998 + }, + { + "epoch": 1.8209136439520412, + "grad_norm": 0.5881639719009399, + "learning_rate": 6.968208970335123e-05, + "loss": 1.3722, + "step": 5999 + }, + { + "epoch": 1.8212171801487327, + "grad_norm": 0.5512107610702515, + "learning_rate": 6.967702743748103e-05, + "loss": 1.6681, + "step": 6000 + }, + { + "epoch": 1.8215207163454243, + "grad_norm": 0.574345588684082, + "learning_rate": 6.967196517161082e-05, + "loss": 1.5918, + "step": 6001 + }, + { + "epoch": 1.8218242525421156, + "grad_norm": 0.7422041893005371, + "learning_rate": 6.966690290574062e-05, + "loss": 1.6145, + "step": 6002 + }, + { + "epoch": 1.8221277887388072, + "grad_norm": 0.5157979726791382, + "learning_rate": 6.966184063987041e-05, + "loss": 1.9659, + "step": 6003 + }, + { + "epoch": 1.8224313249354984, + "grad_norm": 0.608417272567749, + "learning_rate": 6.965677837400021e-05, + "loss": 1.8913, + "step": 6004 + }, + { + "epoch": 1.82273486113219, + "grad_norm": 0.5558058023452759, + "learning_rate": 6.965171610813e-05, + "loss": 1.4399, + "step": 6005 + }, + { + "epoch": 1.8230383973288815, + "grad_norm": 0.5612147450447083, + "learning_rate": 6.96466538422598e-05, + "loss": 1.7211, + "step": 6006 + }, + { + "epoch": 1.823341933525573, + "grad_norm": 0.5312124490737915, + "learning_rate": 6.964159157638959e-05, + "loss": 1.8771, + "step": 6007 + }, + { + "epoch": 1.8236454697222644, + "grad_norm": 0.5921664237976074, + "learning_rate": 6.963652931051939e-05, + "loss": 1.8804, + "step": 6008 + }, + { + "epoch": 1.8239490059189558, + "grad_norm": 0.4935623109340668, + "learning_rate": 6.96314670446492e-05, + "loss": 1.0369, + "step": 6009 + }, + { + "epoch": 1.8242525421156472, + "grad_norm": 0.4994628429412842, + "learning_rate": 6.962640477877899e-05, + "loss": 1.9222, + "step": 6010 + }, + { + "epoch": 1.8245560783123387, + "grad_norm": 0.5165558457374573, + "learning_rate": 6.962134251290879e-05, + "loss": 1.731, + "step": 6011 + }, + { + "epoch": 1.8248596145090303, + "grad_norm": 0.5655200481414795, + "learning_rate": 6.961628024703858e-05, + "loss": 1.6356, + "step": 6012 + }, + { + "epoch": 1.8251631507057215, + "grad_norm": 0.4943961203098297, + "learning_rate": 6.961121798116838e-05, + "loss": 1.9832, + "step": 6013 + }, + { + "epoch": 1.8254666869024132, + "grad_norm": 0.7183212041854858, + "learning_rate": 6.960615571529817e-05, + "loss": 1.5268, + "step": 6014 + }, + { + "epoch": 1.8257702230991044, + "grad_norm": 0.5787492394447327, + "learning_rate": 6.960109344942796e-05, + "loss": 1.615, + "step": 6015 + }, + { + "epoch": 1.826073759295796, + "grad_norm": 0.5697316527366638, + "learning_rate": 6.959603118355776e-05, + "loss": 1.5962, + "step": 6016 + }, + { + "epoch": 1.8263772954924875, + "grad_norm": 0.9180818200111389, + "learning_rate": 6.959096891768755e-05, + "loss": 1.2139, + "step": 6017 + }, + { + "epoch": 1.826680831689179, + "grad_norm": 0.5522624850273132, + "learning_rate": 6.958590665181736e-05, + "loss": 1.7361, + "step": 6018 + }, + { + "epoch": 1.8269843678858704, + "grad_norm": 0.544629693031311, + "learning_rate": 6.958084438594716e-05, + "loss": 1.7042, + "step": 6019 + }, + { + "epoch": 1.8272879040825618, + "grad_norm": 0.5336179137229919, + "learning_rate": 6.957578212007695e-05, + "loss": 1.5085, + "step": 6020 + }, + { + "epoch": 1.8275914402792535, + "grad_norm": 0.5903141498565674, + "learning_rate": 6.957071985420675e-05, + "loss": 1.801, + "step": 6021 + }, + { + "epoch": 1.8278949764759447, + "grad_norm": 0.5685314536094666, + "learning_rate": 6.956565758833654e-05, + "loss": 1.79, + "step": 6022 + }, + { + "epoch": 1.8281985126726363, + "grad_norm": 0.5545884966850281, + "learning_rate": 6.956059532246634e-05, + "loss": 1.5936, + "step": 6023 + }, + { + "epoch": 1.8285020488693275, + "grad_norm": 0.5583124756813049, + "learning_rate": 6.955553305659613e-05, + "loss": 1.9654, + "step": 6024 + }, + { + "epoch": 1.8288055850660192, + "grad_norm": 0.48344558477401733, + "learning_rate": 6.955047079072593e-05, + "loss": 2.088, + "step": 6025 + }, + { + "epoch": 1.8291091212627104, + "grad_norm": 0.6977788209915161, + "learning_rate": 6.954540852485572e-05, + "loss": 1.6136, + "step": 6026 + }, + { + "epoch": 1.829412657459402, + "grad_norm": 0.5369908213615417, + "learning_rate": 6.954034625898553e-05, + "loss": 1.8515, + "step": 6027 + }, + { + "epoch": 1.8297161936560935, + "grad_norm": 0.6249637007713318, + "learning_rate": 6.953528399311532e-05, + "loss": 1.7288, + "step": 6028 + }, + { + "epoch": 1.830019729852785, + "grad_norm": 0.574105978012085, + "learning_rate": 6.953022172724512e-05, + "loss": 1.933, + "step": 6029 + }, + { + "epoch": 1.8303232660494764, + "grad_norm": 0.5597913861274719, + "learning_rate": 6.952515946137491e-05, + "loss": 1.5223, + "step": 6030 + }, + { + "epoch": 1.8306268022461678, + "grad_norm": 0.5201643705368042, + "learning_rate": 6.952009719550471e-05, + "loss": 1.9055, + "step": 6031 + }, + { + "epoch": 1.8309303384428595, + "grad_norm": 0.5336455702781677, + "learning_rate": 6.95150349296345e-05, + "loss": 1.7269, + "step": 6032 + }, + { + "epoch": 1.8312338746395507, + "grad_norm": 0.5909528732299805, + "learning_rate": 6.95099726637643e-05, + "loss": 1.4887, + "step": 6033 + }, + { + "epoch": 1.8315374108362423, + "grad_norm": 0.4900136888027191, + "learning_rate": 6.950491039789409e-05, + "loss": 0.9843, + "step": 6034 + }, + { + "epoch": 1.8318409470329335, + "grad_norm": 0.5249224305152893, + "learning_rate": 6.949984813202389e-05, + "loss": 1.8408, + "step": 6035 + }, + { + "epoch": 1.8321444832296252, + "grad_norm": 0.6888431310653687, + "learning_rate": 6.949478586615368e-05, + "loss": 1.1868, + "step": 6036 + }, + { + "epoch": 1.8324480194263166, + "grad_norm": 0.5488059520721436, + "learning_rate": 6.948972360028349e-05, + "loss": 2.0866, + "step": 6037 + }, + { + "epoch": 1.832751555623008, + "grad_norm": 0.5993854999542236, + "learning_rate": 6.948466133441329e-05, + "loss": 1.2229, + "step": 6038 + }, + { + "epoch": 1.8330550918196995, + "grad_norm": 0.5585269331932068, + "learning_rate": 6.94795990685431e-05, + "loss": 1.4898, + "step": 6039 + }, + { + "epoch": 1.833358628016391, + "grad_norm": 0.5782963633537292, + "learning_rate": 6.947453680267289e-05, + "loss": 1.7766, + "step": 6040 + }, + { + "epoch": 1.8336621642130824, + "grad_norm": 0.46721434593200684, + "learning_rate": 6.946947453680268e-05, + "loss": 1.261, + "step": 6041 + }, + { + "epoch": 1.8339657004097738, + "grad_norm": 0.682328462600708, + "learning_rate": 6.946441227093248e-05, + "loss": 1.3548, + "step": 6042 + }, + { + "epoch": 1.8342692366064655, + "grad_norm": 0.546238362789154, + "learning_rate": 6.945935000506227e-05, + "loss": 1.3244, + "step": 6043 + }, + { + "epoch": 1.8345727728031567, + "grad_norm": 0.6565511226654053, + "learning_rate": 6.945428773919207e-05, + "loss": 1.7417, + "step": 6044 + }, + { + "epoch": 1.8348763089998483, + "grad_norm": 0.511644184589386, + "learning_rate": 6.944922547332186e-05, + "loss": 1.7742, + "step": 6045 + }, + { + "epoch": 1.8351798451965395, + "grad_norm": 0.7202497124671936, + "learning_rate": 6.944416320745166e-05, + "loss": 1.2906, + "step": 6046 + }, + { + "epoch": 1.8354833813932312, + "grad_norm": 0.4779212176799774, + "learning_rate": 6.943910094158145e-05, + "loss": 1.2606, + "step": 6047 + }, + { + "epoch": 1.8357869175899226, + "grad_norm": 0.4823431670665741, + "learning_rate": 6.943403867571126e-05, + "loss": 1.7771, + "step": 6048 + }, + { + "epoch": 1.836090453786614, + "grad_norm": 0.5160930156707764, + "learning_rate": 6.942897640984106e-05, + "loss": 1.8786, + "step": 6049 + }, + { + "epoch": 1.8363939899833055, + "grad_norm": 0.5408679842948914, + "learning_rate": 6.942391414397085e-05, + "loss": 1.3319, + "step": 6050 + }, + { + "epoch": 1.836697526179997, + "grad_norm": 0.5134124159812927, + "learning_rate": 6.941885187810064e-05, + "loss": 1.754, + "step": 6051 + }, + { + "epoch": 1.8370010623766884, + "grad_norm": 0.48612844944000244, + "learning_rate": 6.941378961223044e-05, + "loss": 1.9355, + "step": 6052 + }, + { + "epoch": 1.8373045985733798, + "grad_norm": 0.44790950417518616, + "learning_rate": 6.940872734636023e-05, + "loss": 1.9657, + "step": 6053 + }, + { + "epoch": 1.8376081347700715, + "grad_norm": 0.4986133277416229, + "learning_rate": 6.940366508049003e-05, + "loss": 1.9317, + "step": 6054 + }, + { + "epoch": 1.8379116709667627, + "grad_norm": 0.5185719132423401, + "learning_rate": 6.939860281461982e-05, + "loss": 1.8908, + "step": 6055 + }, + { + "epoch": 1.8382152071634543, + "grad_norm": 0.5861765146255493, + "learning_rate": 6.939354054874962e-05, + "loss": 1.6715, + "step": 6056 + }, + { + "epoch": 1.8385187433601455, + "grad_norm": 0.5141394138336182, + "learning_rate": 6.938847828287943e-05, + "loss": 1.1495, + "step": 6057 + }, + { + "epoch": 1.8388222795568372, + "grad_norm": 0.4696032702922821, + "learning_rate": 6.938341601700922e-05, + "loss": 1.6885, + "step": 6058 + }, + { + "epoch": 1.8391258157535286, + "grad_norm": 0.546829104423523, + "learning_rate": 6.937835375113902e-05, + "loss": 1.4341, + "step": 6059 + }, + { + "epoch": 1.83942935195022, + "grad_norm": 0.5794239044189453, + "learning_rate": 6.937329148526881e-05, + "loss": 2.0196, + "step": 6060 + }, + { + "epoch": 1.8397328881469115, + "grad_norm": 0.584017276763916, + "learning_rate": 6.93682292193986e-05, + "loss": 2.069, + "step": 6061 + }, + { + "epoch": 1.840036424343603, + "grad_norm": 0.5827233195304871, + "learning_rate": 6.93631669535284e-05, + "loss": 1.7364, + "step": 6062 + }, + { + "epoch": 1.8403399605402946, + "grad_norm": 0.6362305283546448, + "learning_rate": 6.93581046876582e-05, + "loss": 1.8902, + "step": 6063 + }, + { + "epoch": 1.8406434967369858, + "grad_norm": 0.5154918432235718, + "learning_rate": 6.935304242178799e-05, + "loss": 0.9184, + "step": 6064 + }, + { + "epoch": 1.8409470329336775, + "grad_norm": 0.5192784070968628, + "learning_rate": 6.934798015591779e-05, + "loss": 1.5757, + "step": 6065 + }, + { + "epoch": 1.8412505691303687, + "grad_norm": 0.4856669008731842, + "learning_rate": 6.93429178900476e-05, + "loss": 1.2768, + "step": 6066 + }, + { + "epoch": 1.8415541053270603, + "grad_norm": 0.5663694739341736, + "learning_rate": 6.933785562417739e-05, + "loss": 1.6493, + "step": 6067 + }, + { + "epoch": 1.8418576415237518, + "grad_norm": 0.488754004240036, + "learning_rate": 6.933279335830718e-05, + "loss": 1.7893, + "step": 6068 + }, + { + "epoch": 1.8421611777204432, + "grad_norm": 0.5024654269218445, + "learning_rate": 6.932773109243698e-05, + "loss": 1.5329, + "step": 6069 + }, + { + "epoch": 1.8424647139171346, + "grad_norm": 0.6903789639472961, + "learning_rate": 6.932266882656677e-05, + "loss": 1.1917, + "step": 6070 + }, + { + "epoch": 1.842768250113826, + "grad_norm": 0.6057137846946716, + "learning_rate": 6.931760656069657e-05, + "loss": 1.4035, + "step": 6071 + }, + { + "epoch": 1.8430717863105175, + "grad_norm": 0.5684095025062561, + "learning_rate": 6.931254429482636e-05, + "loss": 1.8729, + "step": 6072 + }, + { + "epoch": 1.843375322507209, + "grad_norm": 0.9758940935134888, + "learning_rate": 6.930748202895616e-05, + "loss": 1.3789, + "step": 6073 + }, + { + "epoch": 1.8436788587039006, + "grad_norm": 0.5580344200134277, + "learning_rate": 6.930241976308595e-05, + "loss": 1.3577, + "step": 6074 + }, + { + "epoch": 1.8439823949005918, + "grad_norm": 0.6532706618309021, + "learning_rate": 6.929735749721575e-05, + "loss": 1.7574, + "step": 6075 + }, + { + "epoch": 1.8442859310972834, + "grad_norm": 0.5462925434112549, + "learning_rate": 6.929229523134556e-05, + "loss": 1.6758, + "step": 6076 + }, + { + "epoch": 1.8445894672939747, + "grad_norm": 0.6207025647163391, + "learning_rate": 6.928723296547535e-05, + "loss": 1.4469, + "step": 6077 + }, + { + "epoch": 1.8448930034906663, + "grad_norm": 0.5969544053077698, + "learning_rate": 6.928217069960515e-05, + "loss": 2.0165, + "step": 6078 + }, + { + "epoch": 1.8451965396873578, + "grad_norm": 0.5352455973625183, + "learning_rate": 6.927710843373494e-05, + "loss": 1.9362, + "step": 6079 + }, + { + "epoch": 1.8455000758840492, + "grad_norm": 0.5923967361450195, + "learning_rate": 6.927204616786473e-05, + "loss": 1.5255, + "step": 6080 + }, + { + "epoch": 1.8458036120807406, + "grad_norm": 0.6089999079704285, + "learning_rate": 6.926698390199453e-05, + "loss": 1.5816, + "step": 6081 + }, + { + "epoch": 1.846107148277432, + "grad_norm": 0.602755606174469, + "learning_rate": 6.926192163612432e-05, + "loss": 1.5102, + "step": 6082 + }, + { + "epoch": 1.8464106844741235, + "grad_norm": 0.48048046231269836, + "learning_rate": 6.925685937025412e-05, + "loss": 1.8209, + "step": 6083 + }, + { + "epoch": 1.846714220670815, + "grad_norm": 0.6715882420539856, + "learning_rate": 6.925179710438393e-05, + "loss": 1.3094, + "step": 6084 + }, + { + "epoch": 1.8470177568675066, + "grad_norm": 0.5754781365394592, + "learning_rate": 6.924673483851372e-05, + "loss": 1.6519, + "step": 6085 + }, + { + "epoch": 1.8473212930641978, + "grad_norm": 0.46512678265571594, + "learning_rate": 6.924167257264352e-05, + "loss": 1.461, + "step": 6086 + }, + { + "epoch": 1.8476248292608894, + "grad_norm": 0.5386472940444946, + "learning_rate": 6.923661030677333e-05, + "loss": 1.8878, + "step": 6087 + }, + { + "epoch": 1.8479283654575807, + "grad_norm": 0.6794739365577698, + "learning_rate": 6.923154804090312e-05, + "loss": 1.957, + "step": 6088 + }, + { + "epoch": 1.8482319016542723, + "grad_norm": 0.635474681854248, + "learning_rate": 6.922648577503291e-05, + "loss": 1.138, + "step": 6089 + }, + { + "epoch": 1.8485354378509637, + "grad_norm": 0.8729957938194275, + "learning_rate": 6.922142350916271e-05, + "loss": 1.5292, + "step": 6090 + }, + { + "epoch": 1.8488389740476552, + "grad_norm": 0.5206477642059326, + "learning_rate": 6.92163612432925e-05, + "loss": 1.6186, + "step": 6091 + }, + { + "epoch": 1.8491425102443466, + "grad_norm": 0.5747119784355164, + "learning_rate": 6.92112989774223e-05, + "loss": 1.7004, + "step": 6092 + }, + { + "epoch": 1.849446046441038, + "grad_norm": 0.5725748538970947, + "learning_rate": 6.92062367115521e-05, + "loss": 1.3208, + "step": 6093 + }, + { + "epoch": 1.8497495826377297, + "grad_norm": 0.4918226897716522, + "learning_rate": 6.920117444568189e-05, + "loss": 1.4527, + "step": 6094 + }, + { + "epoch": 1.850053118834421, + "grad_norm": 0.5622021555900574, + "learning_rate": 6.919611217981168e-05, + "loss": 1.4738, + "step": 6095 + }, + { + "epoch": 1.8503566550311126, + "grad_norm": 0.48067110776901245, + "learning_rate": 6.919104991394149e-05, + "loss": 1.7989, + "step": 6096 + }, + { + "epoch": 1.8506601912278038, + "grad_norm": 0.5293039083480835, + "learning_rate": 6.918598764807129e-05, + "loss": 1.7867, + "step": 6097 + }, + { + "epoch": 1.8509637274244954, + "grad_norm": 0.6090536117553711, + "learning_rate": 6.918092538220108e-05, + "loss": 1.3639, + "step": 6098 + }, + { + "epoch": 1.8512672636211869, + "grad_norm": 0.5745795369148254, + "learning_rate": 6.917586311633088e-05, + "loss": 1.1913, + "step": 6099 + }, + { + "epoch": 1.8515707998178783, + "grad_norm": 0.9985912442207336, + "learning_rate": 6.917080085046067e-05, + "loss": 1.5265, + "step": 6100 + }, + { + "epoch": 1.8518743360145697, + "grad_norm": 0.7579218745231628, + "learning_rate": 6.916573858459047e-05, + "loss": 1.8763, + "step": 6101 + }, + { + "epoch": 1.8521778722112612, + "grad_norm": 0.5257670283317566, + "learning_rate": 6.916067631872026e-05, + "loss": 1.5928, + "step": 6102 + }, + { + "epoch": 1.8524814084079526, + "grad_norm": 0.46650373935699463, + "learning_rate": 6.915561405285006e-05, + "loss": 1.5768, + "step": 6103 + }, + { + "epoch": 1.852784944604644, + "grad_norm": 0.4195265471935272, + "learning_rate": 6.915055178697985e-05, + "loss": 1.5363, + "step": 6104 + }, + { + "epoch": 1.8530884808013357, + "grad_norm": 0.5275460481643677, + "learning_rate": 6.914548952110966e-05, + "loss": 1.5679, + "step": 6105 + }, + { + "epoch": 1.853392016998027, + "grad_norm": 0.4654236435890198, + "learning_rate": 6.914042725523945e-05, + "loss": 1.8447, + "step": 6106 + }, + { + "epoch": 1.8536955531947186, + "grad_norm": 0.5061540007591248, + "learning_rate": 6.913536498936925e-05, + "loss": 1.8073, + "step": 6107 + }, + { + "epoch": 1.8539990893914098, + "grad_norm": 0.5491917729377747, + "learning_rate": 6.913030272349904e-05, + "loss": 1.4774, + "step": 6108 + }, + { + "epoch": 1.8543026255881014, + "grad_norm": 0.7270756363868713, + "learning_rate": 6.912524045762884e-05, + "loss": 1.3623, + "step": 6109 + }, + { + "epoch": 1.8546061617847929, + "grad_norm": 0.5843378305435181, + "learning_rate": 6.912017819175863e-05, + "loss": 1.6507, + "step": 6110 + }, + { + "epoch": 1.8549096979814843, + "grad_norm": 0.5927285552024841, + "learning_rate": 6.911511592588843e-05, + "loss": 1.5258, + "step": 6111 + }, + { + "epoch": 1.8552132341781757, + "grad_norm": 0.50548255443573, + "learning_rate": 6.911005366001822e-05, + "loss": 1.3183, + "step": 6112 + }, + { + "epoch": 1.8555167703748672, + "grad_norm": 0.6694241762161255, + "learning_rate": 6.910499139414802e-05, + "loss": 1.4925, + "step": 6113 + }, + { + "epoch": 1.8558203065715586, + "grad_norm": 0.628274142742157, + "learning_rate": 6.909992912827781e-05, + "loss": 1.697, + "step": 6114 + }, + { + "epoch": 1.85612384276825, + "grad_norm": 0.6124897003173828, + "learning_rate": 6.909486686240762e-05, + "loss": 1.6125, + "step": 6115 + }, + { + "epoch": 1.8564273789649417, + "grad_norm": 0.5437746644020081, + "learning_rate": 6.908980459653742e-05, + "loss": 1.4485, + "step": 6116 + }, + { + "epoch": 1.856730915161633, + "grad_norm": 0.5141922831535339, + "learning_rate": 6.908474233066721e-05, + "loss": 1.8619, + "step": 6117 + }, + { + "epoch": 1.8570344513583246, + "grad_norm": 0.5010159611701965, + "learning_rate": 6.9079680064797e-05, + "loss": 1.8498, + "step": 6118 + }, + { + "epoch": 1.8573379875550158, + "grad_norm": 0.44890761375427246, + "learning_rate": 6.90746177989268e-05, + "loss": 1.7194, + "step": 6119 + }, + { + "epoch": 1.8576415237517074, + "grad_norm": 0.5114756226539612, + "learning_rate": 6.90695555330566e-05, + "loss": 1.7011, + "step": 6120 + }, + { + "epoch": 1.8579450599483989, + "grad_norm": 0.5562925934791565, + "learning_rate": 6.906449326718639e-05, + "loss": 1.6741, + "step": 6121 + }, + { + "epoch": 1.8582485961450903, + "grad_norm": 0.631549060344696, + "learning_rate": 6.905943100131618e-05, + "loss": 1.7297, + "step": 6122 + }, + { + "epoch": 1.8585521323417817, + "grad_norm": 0.8341001868247986, + "learning_rate": 6.905436873544598e-05, + "loss": 1.5796, + "step": 6123 + }, + { + "epoch": 1.8588556685384732, + "grad_norm": 0.5715938210487366, + "learning_rate": 6.904930646957579e-05, + "loss": 1.4869, + "step": 6124 + }, + { + "epoch": 1.8591592047351648, + "grad_norm": 0.5713958144187927, + "learning_rate": 6.904424420370558e-05, + "loss": 1.3331, + "step": 6125 + }, + { + "epoch": 1.859462740931856, + "grad_norm": 0.6407903432846069, + "learning_rate": 6.903918193783538e-05, + "loss": 1.5363, + "step": 6126 + }, + { + "epoch": 1.8597662771285477, + "grad_norm": 0.5533556938171387, + "learning_rate": 6.903411967196517e-05, + "loss": 1.8796, + "step": 6127 + }, + { + "epoch": 1.860069813325239, + "grad_norm": 0.9236746430397034, + "learning_rate": 6.902905740609498e-05, + "loss": 1.5267, + "step": 6128 + }, + { + "epoch": 1.8603733495219306, + "grad_norm": 0.6433690190315247, + "learning_rate": 6.902399514022477e-05, + "loss": 1.5664, + "step": 6129 + }, + { + "epoch": 1.860676885718622, + "grad_norm": 0.47277143597602844, + "learning_rate": 6.901893287435457e-05, + "loss": 1.2047, + "step": 6130 + }, + { + "epoch": 1.8609804219153134, + "grad_norm": 0.7604238986968994, + "learning_rate": 6.901387060848436e-05, + "loss": 1.927, + "step": 6131 + }, + { + "epoch": 1.8612839581120049, + "grad_norm": 0.6293209791183472, + "learning_rate": 6.900880834261416e-05, + "loss": 1.9315, + "step": 6132 + }, + { + "epoch": 1.8615874943086963, + "grad_norm": 0.5206697583198547, + "learning_rate": 6.900374607674395e-05, + "loss": 1.6956, + "step": 6133 + }, + { + "epoch": 1.8618910305053877, + "grad_norm": 0.45433950424194336, + "learning_rate": 6.899868381087375e-05, + "loss": 1.5459, + "step": 6134 + }, + { + "epoch": 1.8621945667020792, + "grad_norm": 0.4610427916049957, + "learning_rate": 6.899362154500356e-05, + "loss": 1.9453, + "step": 6135 + }, + { + "epoch": 1.8624981028987708, + "grad_norm": 0.6815734505653381, + "learning_rate": 6.898855927913335e-05, + "loss": 1.4047, + "step": 6136 + }, + { + "epoch": 1.862801639095462, + "grad_norm": 0.5030832886695862, + "learning_rate": 6.898349701326315e-05, + "loss": 1.7727, + "step": 6137 + }, + { + "epoch": 1.8631051752921537, + "grad_norm": 0.4881182014942169, + "learning_rate": 6.897843474739294e-05, + "loss": 0.9128, + "step": 6138 + }, + { + "epoch": 1.863408711488845, + "grad_norm": 0.5349050164222717, + "learning_rate": 6.897337248152274e-05, + "loss": 1.7836, + "step": 6139 + }, + { + "epoch": 1.8637122476855366, + "grad_norm": 0.5377179384231567, + "learning_rate": 6.896831021565253e-05, + "loss": 1.705, + "step": 6140 + }, + { + "epoch": 1.864015783882228, + "grad_norm": 0.5401241779327393, + "learning_rate": 6.896324794978233e-05, + "loss": 1.6809, + "step": 6141 + }, + { + "epoch": 1.8643193200789194, + "grad_norm": 0.4488482177257538, + "learning_rate": 6.895818568391212e-05, + "loss": 1.7207, + "step": 6142 + }, + { + "epoch": 1.8646228562756109, + "grad_norm": 0.878682553768158, + "learning_rate": 6.895312341804192e-05, + "loss": 1.7966, + "step": 6143 + }, + { + "epoch": 1.8649263924723023, + "grad_norm": 0.5451293587684631, + "learning_rate": 6.894806115217172e-05, + "loss": 1.3851, + "step": 6144 + }, + { + "epoch": 1.8652299286689937, + "grad_norm": 0.5849304795265198, + "learning_rate": 6.894299888630152e-05, + "loss": 1.7496, + "step": 6145 + }, + { + "epoch": 1.8655334648656852, + "grad_norm": 0.500056803226471, + "learning_rate": 6.893793662043131e-05, + "loss": 1.9507, + "step": 6146 + }, + { + "epoch": 1.8658370010623768, + "grad_norm": 0.5664394497871399, + "learning_rate": 6.893287435456111e-05, + "loss": 1.7785, + "step": 6147 + }, + { + "epoch": 1.866140537259068, + "grad_norm": 0.5805900692939758, + "learning_rate": 6.89278120886909e-05, + "loss": 1.4488, + "step": 6148 + }, + { + "epoch": 1.8664440734557597, + "grad_norm": 0.5849188566207886, + "learning_rate": 6.89227498228207e-05, + "loss": 1.5485, + "step": 6149 + }, + { + "epoch": 1.866747609652451, + "grad_norm": 0.6293565034866333, + "learning_rate": 6.891768755695049e-05, + "loss": 1.3529, + "step": 6150 + }, + { + "epoch": 1.8670511458491426, + "grad_norm": 0.5155745148658752, + "learning_rate": 6.891262529108029e-05, + "loss": 1.8168, + "step": 6151 + }, + { + "epoch": 1.867354682045834, + "grad_norm": 0.5031374096870422, + "learning_rate": 6.890756302521008e-05, + "loss": 1.6329, + "step": 6152 + }, + { + "epoch": 1.8676582182425254, + "grad_norm": 0.5657221674919128, + "learning_rate": 6.890250075933988e-05, + "loss": 1.9763, + "step": 6153 + }, + { + "epoch": 1.8679617544392169, + "grad_norm": 0.6052532196044922, + "learning_rate": 6.889743849346969e-05, + "loss": 1.8486, + "step": 6154 + }, + { + "epoch": 1.8682652906359083, + "grad_norm": 0.5807188749313354, + "learning_rate": 6.889237622759948e-05, + "loss": 1.7798, + "step": 6155 + }, + { + "epoch": 1.8685688268326, + "grad_norm": 0.5780386328697205, + "learning_rate": 6.888731396172927e-05, + "loss": 1.6593, + "step": 6156 + }, + { + "epoch": 1.8688723630292912, + "grad_norm": 0.5649313926696777, + "learning_rate": 6.888225169585907e-05, + "loss": 1.4904, + "step": 6157 + }, + { + "epoch": 1.8691758992259828, + "grad_norm": 0.5888344645500183, + "learning_rate": 6.887718942998886e-05, + "loss": 1.6099, + "step": 6158 + }, + { + "epoch": 1.869479435422674, + "grad_norm": 0.45079630613327026, + "learning_rate": 6.887212716411866e-05, + "loss": 1.1967, + "step": 6159 + }, + { + "epoch": 1.8697829716193657, + "grad_norm": 0.6415480971336365, + "learning_rate": 6.886706489824845e-05, + "loss": 1.5475, + "step": 6160 + }, + { + "epoch": 1.8700865078160571, + "grad_norm": 0.5611491203308105, + "learning_rate": 6.886200263237825e-05, + "loss": 1.6124, + "step": 6161 + }, + { + "epoch": 1.8703900440127486, + "grad_norm": 0.47479820251464844, + "learning_rate": 6.885694036650804e-05, + "loss": 1.3631, + "step": 6162 + }, + { + "epoch": 1.87069358020944, + "grad_norm": 0.6043630242347717, + "learning_rate": 6.885187810063785e-05, + "loss": 1.9075, + "step": 6163 + }, + { + "epoch": 1.8709971164061314, + "grad_norm": 0.5722765922546387, + "learning_rate": 6.884681583476765e-05, + "loss": 1.8247, + "step": 6164 + }, + { + "epoch": 1.8713006526028229, + "grad_norm": 0.6177046895027161, + "learning_rate": 6.884175356889744e-05, + "loss": 1.8107, + "step": 6165 + }, + { + "epoch": 1.8716041887995143, + "grad_norm": 0.528279721736908, + "learning_rate": 6.883669130302724e-05, + "loss": 1.7669, + "step": 6166 + }, + { + "epoch": 1.871907724996206, + "grad_norm": 0.5595422983169556, + "learning_rate": 6.883162903715703e-05, + "loss": 1.8459, + "step": 6167 + }, + { + "epoch": 1.8722112611928972, + "grad_norm": 0.488857239484787, + "learning_rate": 6.882656677128683e-05, + "loss": 1.7021, + "step": 6168 + }, + { + "epoch": 1.8725147973895888, + "grad_norm": 0.5302273631095886, + "learning_rate": 6.882150450541662e-05, + "loss": 1.2417, + "step": 6169 + }, + { + "epoch": 1.87281833358628, + "grad_norm": 0.4488196074962616, + "learning_rate": 6.881644223954642e-05, + "loss": 1.7382, + "step": 6170 + }, + { + "epoch": 1.8731218697829717, + "grad_norm": 0.5027474761009216, + "learning_rate": 6.881137997367621e-05, + "loss": 1.8507, + "step": 6171 + }, + { + "epoch": 1.8734254059796631, + "grad_norm": 0.44171908497810364, + "learning_rate": 6.880631770780602e-05, + "loss": 0.5499, + "step": 6172 + }, + { + "epoch": 1.8737289421763546, + "grad_norm": 0.6166810989379883, + "learning_rate": 6.880125544193581e-05, + "loss": 1.3108, + "step": 6173 + }, + { + "epoch": 1.874032478373046, + "grad_norm": 0.6441341638565063, + "learning_rate": 6.879619317606562e-05, + "loss": 1.1281, + "step": 6174 + }, + { + "epoch": 1.8743360145697374, + "grad_norm": 0.5491599440574646, + "learning_rate": 6.879113091019542e-05, + "loss": 1.5373, + "step": 6175 + }, + { + "epoch": 1.8746395507664289, + "grad_norm": 0.4813689887523651, + "learning_rate": 6.878606864432521e-05, + "loss": 1.7681, + "step": 6176 + }, + { + "epoch": 1.8749430869631203, + "grad_norm": 0.5879645347595215, + "learning_rate": 6.8781006378455e-05, + "loss": 1.7098, + "step": 6177 + }, + { + "epoch": 1.875246623159812, + "grad_norm": 0.5867793560028076, + "learning_rate": 6.87759441125848e-05, + "loss": 1.4471, + "step": 6178 + }, + { + "epoch": 1.8755501593565032, + "grad_norm": 0.37060195207595825, + "learning_rate": 6.87708818467146e-05, + "loss": 1.1236, + "step": 6179 + }, + { + "epoch": 1.8758536955531948, + "grad_norm": 0.6691044569015503, + "learning_rate": 6.876581958084439e-05, + "loss": 1.0482, + "step": 6180 + }, + { + "epoch": 1.876157231749886, + "grad_norm": 0.6610668897628784, + "learning_rate": 6.876075731497419e-05, + "loss": 1.7718, + "step": 6181 + }, + { + "epoch": 1.8764607679465777, + "grad_norm": 0.6298898458480835, + "learning_rate": 6.875569504910398e-05, + "loss": 1.4496, + "step": 6182 + }, + { + "epoch": 1.8767643041432691, + "grad_norm": 0.5803454518318176, + "learning_rate": 6.875063278323379e-05, + "loss": 1.0997, + "step": 6183 + }, + { + "epoch": 1.8770678403399605, + "grad_norm": 0.6385704874992371, + "learning_rate": 6.874557051736358e-05, + "loss": 1.5478, + "step": 6184 + }, + { + "epoch": 1.877371376536652, + "grad_norm": 0.6655149459838867, + "learning_rate": 6.874050825149338e-05, + "loss": 1.3631, + "step": 6185 + }, + { + "epoch": 1.8776749127333434, + "grad_norm": 0.5710660815238953, + "learning_rate": 6.873544598562317e-05, + "loss": 1.7522, + "step": 6186 + }, + { + "epoch": 1.877978448930035, + "grad_norm": 0.5770572423934937, + "learning_rate": 6.873038371975297e-05, + "loss": 1.4289, + "step": 6187 + }, + { + "epoch": 1.8782819851267263, + "grad_norm": 0.6256627440452576, + "learning_rate": 6.872532145388276e-05, + "loss": 1.9416, + "step": 6188 + }, + { + "epoch": 1.878585521323418, + "grad_norm": 0.6375580430030823, + "learning_rate": 6.872025918801256e-05, + "loss": 1.4679, + "step": 6189 + }, + { + "epoch": 1.8788890575201092, + "grad_norm": 0.5342978835105896, + "learning_rate": 6.871519692214235e-05, + "loss": 1.6705, + "step": 6190 + }, + { + "epoch": 1.8791925937168008, + "grad_norm": 0.6334168910980225, + "learning_rate": 6.871013465627215e-05, + "loss": 1.5592, + "step": 6191 + }, + { + "epoch": 1.879496129913492, + "grad_norm": 0.48108720779418945, + "learning_rate": 6.870507239040194e-05, + "loss": 1.3368, + "step": 6192 + }, + { + "epoch": 1.8797996661101837, + "grad_norm": 0.45311546325683594, + "learning_rate": 6.870001012453175e-05, + "loss": 1.7126, + "step": 6193 + }, + { + "epoch": 1.880103202306875, + "grad_norm": 0.5576416850090027, + "learning_rate": 6.869494785866154e-05, + "loss": 1.7894, + "step": 6194 + }, + { + "epoch": 1.8804067385035665, + "grad_norm": 0.6543675661087036, + "learning_rate": 6.868988559279134e-05, + "loss": 1.4302, + "step": 6195 + }, + { + "epoch": 1.880710274700258, + "grad_norm": 0.592578113079071, + "learning_rate": 6.868482332692113e-05, + "loss": 1.9324, + "step": 6196 + }, + { + "epoch": 1.8810138108969494, + "grad_norm": 0.4866163432598114, + "learning_rate": 6.867976106105093e-05, + "loss": 1.7496, + "step": 6197 + }, + { + "epoch": 1.881317347093641, + "grad_norm": 0.6053670048713684, + "learning_rate": 6.867469879518072e-05, + "loss": 1.6693, + "step": 6198 + }, + { + "epoch": 1.8816208832903323, + "grad_norm": 0.5006988048553467, + "learning_rate": 6.866963652931052e-05, + "loss": 1.5501, + "step": 6199 + }, + { + "epoch": 1.881924419487024, + "grad_norm": 0.5762816667556763, + "learning_rate": 6.866457426344031e-05, + "loss": 1.7895, + "step": 6200 + }, + { + "epoch": 1.8822279556837151, + "grad_norm": 0.6253973245620728, + "learning_rate": 6.865951199757011e-05, + "loss": 1.4188, + "step": 6201 + }, + { + "epoch": 1.8825314918804068, + "grad_norm": 0.6388218998908997, + "learning_rate": 6.865444973169992e-05, + "loss": 2.0189, + "step": 6202 + }, + { + "epoch": 1.8828350280770982, + "grad_norm": 0.45942676067352295, + "learning_rate": 6.864938746582971e-05, + "loss": 1.8114, + "step": 6203 + }, + { + "epoch": 1.8831385642737897, + "grad_norm": 0.5745331645011902, + "learning_rate": 6.86443251999595e-05, + "loss": 1.6943, + "step": 6204 + }, + { + "epoch": 1.883442100470481, + "grad_norm": 0.6693764925003052, + "learning_rate": 6.86392629340893e-05, + "loss": 1.8772, + "step": 6205 + }, + { + "epoch": 1.8837456366671725, + "grad_norm": 0.5581064820289612, + "learning_rate": 6.86342006682191e-05, + "loss": 1.3722, + "step": 6206 + }, + { + "epoch": 1.884049172863864, + "grad_norm": 0.8228082656860352, + "learning_rate": 6.862913840234889e-05, + "loss": 1.8403, + "step": 6207 + }, + { + "epoch": 1.8843527090605554, + "grad_norm": 0.5933247804641724, + "learning_rate": 6.862407613647869e-05, + "loss": 2.0294, + "step": 6208 + }, + { + "epoch": 1.884656245257247, + "grad_norm": 0.5767022967338562, + "learning_rate": 6.861901387060848e-05, + "loss": 1.4142, + "step": 6209 + }, + { + "epoch": 1.8849597814539383, + "grad_norm": 0.4476047158241272, + "learning_rate": 6.861395160473827e-05, + "loss": 1.3916, + "step": 6210 + }, + { + "epoch": 1.88526331765063, + "grad_norm": 0.5437102913856506, + "learning_rate": 6.860888933886808e-05, + "loss": 1.7854, + "step": 6211 + }, + { + "epoch": 1.8855668538473211, + "grad_norm": 0.5073631405830383, + "learning_rate": 6.860382707299788e-05, + "loss": 1.5508, + "step": 6212 + }, + { + "epoch": 1.8858703900440128, + "grad_norm": 0.4325421154499054, + "learning_rate": 6.859876480712767e-05, + "loss": 1.5213, + "step": 6213 + }, + { + "epoch": 1.8861739262407042, + "grad_norm": 0.6551002264022827, + "learning_rate": 6.859370254125747e-05, + "loss": 1.7269, + "step": 6214 + }, + { + "epoch": 1.8864774624373957, + "grad_norm": 0.5482550859451294, + "learning_rate": 6.858864027538726e-05, + "loss": 1.3621, + "step": 6215 + }, + { + "epoch": 1.886780998634087, + "grad_norm": 0.5425487160682678, + "learning_rate": 6.858357800951706e-05, + "loss": 1.8515, + "step": 6216 + }, + { + "epoch": 1.8870845348307785, + "grad_norm": 0.5646753311157227, + "learning_rate": 6.857851574364687e-05, + "loss": 1.8606, + "step": 6217 + }, + { + "epoch": 1.88738807102747, + "grad_norm": 0.6172170639038086, + "learning_rate": 6.857345347777666e-05, + "loss": 1.7505, + "step": 6218 + }, + { + "epoch": 1.8876916072241614, + "grad_norm": 0.555770754814148, + "learning_rate": 6.856839121190646e-05, + "loss": 1.4309, + "step": 6219 + }, + { + "epoch": 1.887995143420853, + "grad_norm": 0.6966986656188965, + "learning_rate": 6.856332894603625e-05, + "loss": 1.5179, + "step": 6220 + }, + { + "epoch": 1.8882986796175443, + "grad_norm": 0.5168276429176331, + "learning_rate": 6.855826668016604e-05, + "loss": 1.7389, + "step": 6221 + }, + { + "epoch": 1.888602215814236, + "grad_norm": 0.5563384294509888, + "learning_rate": 6.855320441429585e-05, + "loss": 1.6291, + "step": 6222 + }, + { + "epoch": 1.8889057520109271, + "grad_norm": 0.6013602614402771, + "learning_rate": 6.854814214842565e-05, + "loss": 1.61, + "step": 6223 + }, + { + "epoch": 1.8892092882076188, + "grad_norm": 0.5379372835159302, + "learning_rate": 6.854307988255544e-05, + "loss": 1.6609, + "step": 6224 + }, + { + "epoch": 1.8895128244043102, + "grad_norm": 0.5397657752037048, + "learning_rate": 6.853801761668524e-05, + "loss": 1.4849, + "step": 6225 + }, + { + "epoch": 1.8898163606010017, + "grad_norm": 0.5665563941001892, + "learning_rate": 6.853295535081503e-05, + "loss": 1.8491, + "step": 6226 + }, + { + "epoch": 1.890119896797693, + "grad_norm": 0.46644145250320435, + "learning_rate": 6.852789308494483e-05, + "loss": 1.875, + "step": 6227 + }, + { + "epoch": 1.8904234329943845, + "grad_norm": 0.5429653525352478, + "learning_rate": 6.852283081907462e-05, + "loss": 1.9378, + "step": 6228 + }, + { + "epoch": 1.8907269691910762, + "grad_norm": 0.5680977702140808, + "learning_rate": 6.851776855320442e-05, + "loss": 1.6695, + "step": 6229 + }, + { + "epoch": 1.8910305053877674, + "grad_norm": 0.5593467354774475, + "learning_rate": 6.851270628733421e-05, + "loss": 1.6577, + "step": 6230 + }, + { + "epoch": 1.891334041584459, + "grad_norm": 0.57649165391922, + "learning_rate": 6.8507644021464e-05, + "loss": 1.7832, + "step": 6231 + }, + { + "epoch": 1.8916375777811503, + "grad_norm": 0.6322312355041504, + "learning_rate": 6.850258175559381e-05, + "loss": 1.7039, + "step": 6232 + }, + { + "epoch": 1.891941113977842, + "grad_norm": 0.5910970568656921, + "learning_rate": 6.849751948972361e-05, + "loss": 1.4358, + "step": 6233 + }, + { + "epoch": 1.8922446501745334, + "grad_norm": 0.5287604331970215, + "learning_rate": 6.84924572238534e-05, + "loss": 1.749, + "step": 6234 + }, + { + "epoch": 1.8925481863712248, + "grad_norm": 0.6745063066482544, + "learning_rate": 6.84873949579832e-05, + "loss": 1.4466, + "step": 6235 + }, + { + "epoch": 1.8928517225679162, + "grad_norm": 0.5758655071258545, + "learning_rate": 6.8482332692113e-05, + "loss": 1.6522, + "step": 6236 + }, + { + "epoch": 1.8931552587646077, + "grad_norm": 0.5966370105743408, + "learning_rate": 6.847727042624279e-05, + "loss": 1.7655, + "step": 6237 + }, + { + "epoch": 1.893458794961299, + "grad_norm": 0.6047906279563904, + "learning_rate": 6.847220816037258e-05, + "loss": 1.6902, + "step": 6238 + }, + { + "epoch": 1.8937623311579905, + "grad_norm": 0.553486168384552, + "learning_rate": 6.846714589450238e-05, + "loss": 1.7621, + "step": 6239 + }, + { + "epoch": 1.8940658673546822, + "grad_norm": 0.6047108769416809, + "learning_rate": 6.846208362863217e-05, + "loss": 1.8659, + "step": 6240 + }, + { + "epoch": 1.8943694035513734, + "grad_norm": 0.7074782252311707, + "learning_rate": 6.845702136276198e-05, + "loss": 1.7276, + "step": 6241 + }, + { + "epoch": 1.894672939748065, + "grad_norm": 0.5298722386360168, + "learning_rate": 6.845195909689178e-05, + "loss": 1.8257, + "step": 6242 + }, + { + "epoch": 1.8949764759447563, + "grad_norm": 0.4784451723098755, + "learning_rate": 6.844689683102157e-05, + "loss": 1.8273, + "step": 6243 + }, + { + "epoch": 1.895280012141448, + "grad_norm": 0.7199196219444275, + "learning_rate": 6.844183456515137e-05, + "loss": 1.9046, + "step": 6244 + }, + { + "epoch": 1.8955835483381394, + "grad_norm": 0.5387197136878967, + "learning_rate": 6.843677229928116e-05, + "loss": 1.5792, + "step": 6245 + }, + { + "epoch": 1.8958870845348308, + "grad_norm": 0.5584871172904968, + "learning_rate": 6.843171003341096e-05, + "loss": 1.5019, + "step": 6246 + }, + { + "epoch": 1.8961906207315222, + "grad_norm": 0.5552901029586792, + "learning_rate": 6.842664776754075e-05, + "loss": 1.7851, + "step": 6247 + }, + { + "epoch": 1.8964941569282137, + "grad_norm": 0.49852436780929565, + "learning_rate": 6.842158550167054e-05, + "loss": 1.2391, + "step": 6248 + }, + { + "epoch": 1.896797693124905, + "grad_norm": 0.6677296161651611, + "learning_rate": 6.841652323580034e-05, + "loss": 1.8005, + "step": 6249 + }, + { + "epoch": 1.8971012293215965, + "grad_norm": 0.5572288632392883, + "learning_rate": 6.841146096993015e-05, + "loss": 1.7112, + "step": 6250 + }, + { + "epoch": 1.8974047655182882, + "grad_norm": 0.4943527579307556, + "learning_rate": 6.840639870405994e-05, + "loss": 1.6392, + "step": 6251 + }, + { + "epoch": 1.8977083017149794, + "grad_norm": 0.4942340552806854, + "learning_rate": 6.840133643818974e-05, + "loss": 1.7768, + "step": 6252 + }, + { + "epoch": 1.898011837911671, + "grad_norm": 0.42028114199638367, + "learning_rate": 6.839627417231953e-05, + "loss": 1.2282, + "step": 6253 + }, + { + "epoch": 1.8983153741083623, + "grad_norm": 0.5391364693641663, + "learning_rate": 6.839121190644933e-05, + "loss": 1.3302, + "step": 6254 + }, + { + "epoch": 1.898618910305054, + "grad_norm": 0.4264262616634369, + "learning_rate": 6.838614964057912e-05, + "loss": 2.3301, + "step": 6255 + }, + { + "epoch": 1.8989224465017454, + "grad_norm": 0.39624980092048645, + "learning_rate": 6.838108737470892e-05, + "loss": 1.2059, + "step": 6256 + }, + { + "epoch": 1.8992259826984368, + "grad_norm": 0.5087313055992126, + "learning_rate": 6.837602510883871e-05, + "loss": 1.4168, + "step": 6257 + }, + { + "epoch": 1.8995295188951282, + "grad_norm": 0.5290665030479431, + "learning_rate": 6.83709628429685e-05, + "loss": 1.9055, + "step": 6258 + }, + { + "epoch": 1.8998330550918197, + "grad_norm": 0.5578790903091431, + "learning_rate": 6.83659005770983e-05, + "loss": 1.6796, + "step": 6259 + }, + { + "epoch": 1.9001365912885113, + "grad_norm": 0.47037312388420105, + "learning_rate": 6.836083831122811e-05, + "loss": 1.3823, + "step": 6260 + }, + { + "epoch": 1.9004401274852025, + "grad_norm": 0.4832285940647125, + "learning_rate": 6.835577604535792e-05, + "loss": 1.87, + "step": 6261 + }, + { + "epoch": 1.9007436636818942, + "grad_norm": 0.553207516670227, + "learning_rate": 6.835071377948771e-05, + "loss": 1.681, + "step": 6262 + }, + { + "epoch": 1.9010471998785854, + "grad_norm": 0.5976235270500183, + "learning_rate": 6.834565151361751e-05, + "loss": 1.5391, + "step": 6263 + }, + { + "epoch": 1.901350736075277, + "grad_norm": 0.6056948304176331, + "learning_rate": 6.83405892477473e-05, + "loss": 1.565, + "step": 6264 + }, + { + "epoch": 1.9016542722719685, + "grad_norm": 0.4596242308616638, + "learning_rate": 6.83355269818771e-05, + "loss": 1.7108, + "step": 6265 + }, + { + "epoch": 1.90195780846866, + "grad_norm": 0.7423454523086548, + "learning_rate": 6.833046471600689e-05, + "loss": 1.3533, + "step": 6266 + }, + { + "epoch": 1.9022613446653514, + "grad_norm": 0.6196095943450928, + "learning_rate": 6.832540245013669e-05, + "loss": 2.036, + "step": 6267 + }, + { + "epoch": 1.9025648808620428, + "grad_norm": 0.48350366950035095, + "learning_rate": 6.832034018426648e-05, + "loss": 1.2346, + "step": 6268 + }, + { + "epoch": 1.9028684170587342, + "grad_norm": 0.4920559823513031, + "learning_rate": 6.831527791839628e-05, + "loss": 1.7805, + "step": 6269 + }, + { + "epoch": 1.9031719532554257, + "grad_norm": 0.6937544345855713, + "learning_rate": 6.831021565252607e-05, + "loss": 1.9112, + "step": 6270 + }, + { + "epoch": 1.9034754894521173, + "grad_norm": 0.5743443965911865, + "learning_rate": 6.830515338665588e-05, + "loss": 1.669, + "step": 6271 + }, + { + "epoch": 1.9037790256488085, + "grad_norm": 0.5362838506698608, + "learning_rate": 6.830009112078567e-05, + "loss": 1.7985, + "step": 6272 + }, + { + "epoch": 1.9040825618455002, + "grad_norm": 0.5269457101821899, + "learning_rate": 6.829502885491547e-05, + "loss": 1.1284, + "step": 6273 + }, + { + "epoch": 1.9043860980421914, + "grad_norm": 0.6071358323097229, + "learning_rate": 6.828996658904526e-05, + "loss": 1.7948, + "step": 6274 + }, + { + "epoch": 1.904689634238883, + "grad_norm": 0.5875253081321716, + "learning_rate": 6.828490432317506e-05, + "loss": 1.4865, + "step": 6275 + }, + { + "epoch": 1.9049931704355745, + "grad_norm": 0.6144857406616211, + "learning_rate": 6.827984205730485e-05, + "loss": 1.2758, + "step": 6276 + }, + { + "epoch": 1.905296706632266, + "grad_norm": 0.6165915727615356, + "learning_rate": 6.827477979143465e-05, + "loss": 1.857, + "step": 6277 + }, + { + "epoch": 1.9056002428289573, + "grad_norm": 0.5990175604820251, + "learning_rate": 6.826971752556444e-05, + "loss": 1.67, + "step": 6278 + }, + { + "epoch": 1.9059037790256488, + "grad_norm": 0.5738477110862732, + "learning_rate": 6.826465525969424e-05, + "loss": 1.6268, + "step": 6279 + }, + { + "epoch": 1.9062073152223402, + "grad_norm": 0.4280742406845093, + "learning_rate": 6.825959299382405e-05, + "loss": 1.8544, + "step": 6280 + }, + { + "epoch": 1.9065108514190316, + "grad_norm": 0.4357243776321411, + "learning_rate": 6.825453072795384e-05, + "loss": 1.2586, + "step": 6281 + }, + { + "epoch": 1.9068143876157233, + "grad_norm": 0.5700424909591675, + "learning_rate": 6.824946846208364e-05, + "loss": 1.7586, + "step": 6282 + }, + { + "epoch": 1.9071179238124145, + "grad_norm": 0.5744918584823608, + "learning_rate": 6.824440619621343e-05, + "loss": 1.8481, + "step": 6283 + }, + { + "epoch": 1.9074214600091062, + "grad_norm": 0.646240770816803, + "learning_rate": 6.823934393034323e-05, + "loss": 1.6283, + "step": 6284 + }, + { + "epoch": 1.9077249962057974, + "grad_norm": 0.5268108248710632, + "learning_rate": 6.823428166447302e-05, + "loss": 1.879, + "step": 6285 + }, + { + "epoch": 1.908028532402489, + "grad_norm": 0.4985029995441437, + "learning_rate": 6.822921939860281e-05, + "loss": 1.8827, + "step": 6286 + }, + { + "epoch": 1.9083320685991805, + "grad_norm": 0.38506224751472473, + "learning_rate": 6.822415713273261e-05, + "loss": 1.7014, + "step": 6287 + }, + { + "epoch": 1.908635604795872, + "grad_norm": 0.5269253253936768, + "learning_rate": 6.82190948668624e-05, + "loss": 1.7312, + "step": 6288 + }, + { + "epoch": 1.9089391409925633, + "grad_norm": 0.5004891753196716, + "learning_rate": 6.821403260099221e-05, + "loss": 1.6025, + "step": 6289 + }, + { + "epoch": 1.9092426771892548, + "grad_norm": 0.5361594557762146, + "learning_rate": 6.820897033512201e-05, + "loss": 1.45, + "step": 6290 + }, + { + "epoch": 1.9095462133859464, + "grad_norm": 0.5427488684654236, + "learning_rate": 6.82039080692518e-05, + "loss": 1.7416, + "step": 6291 + }, + { + "epoch": 1.9098497495826376, + "grad_norm": 0.5532050132751465, + "learning_rate": 6.81988458033816e-05, + "loss": 1.589, + "step": 6292 + }, + { + "epoch": 1.9101532857793293, + "grad_norm": 0.49807676672935486, + "learning_rate": 6.819378353751139e-05, + "loss": 1.2657, + "step": 6293 + }, + { + "epoch": 1.9104568219760205, + "grad_norm": 0.587239146232605, + "learning_rate": 6.818872127164119e-05, + "loss": 1.7507, + "step": 6294 + }, + { + "epoch": 1.9107603581727122, + "grad_norm": 0.5817277431488037, + "learning_rate": 6.818365900577098e-05, + "loss": 1.7011, + "step": 6295 + }, + { + "epoch": 1.9110638943694036, + "grad_norm": 0.46033892035484314, + "learning_rate": 6.817859673990078e-05, + "loss": 1.1022, + "step": 6296 + }, + { + "epoch": 1.911367430566095, + "grad_norm": 0.5857753753662109, + "learning_rate": 6.817353447403057e-05, + "loss": 1.7669, + "step": 6297 + }, + { + "epoch": 1.9116709667627865, + "grad_norm": 0.6346994638442993, + "learning_rate": 6.816847220816037e-05, + "loss": 1.5637, + "step": 6298 + }, + { + "epoch": 1.911974502959478, + "grad_norm": 0.605383038520813, + "learning_rate": 6.816340994229017e-05, + "loss": 1.529, + "step": 6299 + }, + { + "epoch": 1.9122780391561693, + "grad_norm": 0.47823747992515564, + "learning_rate": 6.815834767641997e-05, + "loss": 1.284, + "step": 6300 + }, + { + "epoch": 1.9125815753528608, + "grad_norm": 1.0050535202026367, + "learning_rate": 6.815328541054976e-05, + "loss": 1.7572, + "step": 6301 + }, + { + "epoch": 1.9128851115495524, + "grad_norm": 0.5828871726989746, + "learning_rate": 6.814822314467956e-05, + "loss": 1.6989, + "step": 6302 + }, + { + "epoch": 1.9131886477462436, + "grad_norm": 0.6212213039398193, + "learning_rate": 6.814316087880935e-05, + "loss": 1.7444, + "step": 6303 + }, + { + "epoch": 1.9134921839429353, + "grad_norm": 0.49428045749664307, + "learning_rate": 6.813809861293915e-05, + "loss": 1.5384, + "step": 6304 + }, + { + "epoch": 1.9137957201396265, + "grad_norm": 0.511870801448822, + "learning_rate": 6.813303634706894e-05, + "loss": 1.8753, + "step": 6305 + }, + { + "epoch": 1.9140992563363182, + "grad_norm": 0.5578843951225281, + "learning_rate": 6.812797408119875e-05, + "loss": 1.7222, + "step": 6306 + }, + { + "epoch": 1.9144027925330096, + "grad_norm": 0.4399265944957733, + "learning_rate": 6.812291181532855e-05, + "loss": 1.5536, + "step": 6307 + }, + { + "epoch": 1.914706328729701, + "grad_norm": 0.49018776416778564, + "learning_rate": 6.811784954945834e-05, + "loss": 1.847, + "step": 6308 + }, + { + "epoch": 1.9150098649263925, + "grad_norm": 0.514025866985321, + "learning_rate": 6.811278728358814e-05, + "loss": 2.0138, + "step": 6309 + }, + { + "epoch": 1.915313401123084, + "grad_norm": 0.48464658856391907, + "learning_rate": 6.810772501771794e-05, + "loss": 1.8323, + "step": 6310 + }, + { + "epoch": 1.9156169373197753, + "grad_norm": 0.4685630798339844, + "learning_rate": 6.810266275184774e-05, + "loss": 1.2155, + "step": 6311 + }, + { + "epoch": 1.9159204735164668, + "grad_norm": 0.5250114798545837, + "learning_rate": 6.809760048597753e-05, + "loss": 1.7666, + "step": 6312 + }, + { + "epoch": 1.9162240097131584, + "grad_norm": 0.5229688882827759, + "learning_rate": 6.809253822010733e-05, + "loss": 1.6904, + "step": 6313 + }, + { + "epoch": 1.9165275459098496, + "grad_norm": 0.6486567854881287, + "learning_rate": 6.808747595423712e-05, + "loss": 1.7136, + "step": 6314 + }, + { + "epoch": 1.9168310821065413, + "grad_norm": 0.8866993188858032, + "learning_rate": 6.808241368836692e-05, + "loss": 1.3946, + "step": 6315 + }, + { + "epoch": 1.9171346183032325, + "grad_norm": 0.5037923455238342, + "learning_rate": 6.807735142249671e-05, + "loss": 1.8012, + "step": 6316 + }, + { + "epoch": 1.9174381544999242, + "grad_norm": 0.5571567416191101, + "learning_rate": 6.807228915662651e-05, + "loss": 1.8662, + "step": 6317 + }, + { + "epoch": 1.9177416906966156, + "grad_norm": 0.5871730446815491, + "learning_rate": 6.80672268907563e-05, + "loss": 1.7789, + "step": 6318 + }, + { + "epoch": 1.918045226893307, + "grad_norm": 0.8165649175643921, + "learning_rate": 6.806216462488611e-05, + "loss": 1.7067, + "step": 6319 + }, + { + "epoch": 1.9183487630899985, + "grad_norm": 0.5025912523269653, + "learning_rate": 6.80571023590159e-05, + "loss": 1.6136, + "step": 6320 + }, + { + "epoch": 1.91865229928669, + "grad_norm": 0.5596361756324768, + "learning_rate": 6.80520400931457e-05, + "loss": 1.6783, + "step": 6321 + }, + { + "epoch": 1.9189558354833816, + "grad_norm": 0.6053299903869629, + "learning_rate": 6.80469778272755e-05, + "loss": 1.3395, + "step": 6322 + }, + { + "epoch": 1.9192593716800728, + "grad_norm": 0.5544418692588806, + "learning_rate": 6.804191556140529e-05, + "loss": 1.7102, + "step": 6323 + }, + { + "epoch": 1.9195629078767644, + "grad_norm": 0.7214632034301758, + "learning_rate": 6.803685329553508e-05, + "loss": 1.324, + "step": 6324 + }, + { + "epoch": 1.9198664440734556, + "grad_norm": 0.6079810857772827, + "learning_rate": 6.803179102966488e-05, + "loss": 1.7599, + "step": 6325 + }, + { + "epoch": 1.9201699802701473, + "grad_norm": 0.37600433826446533, + "learning_rate": 6.802672876379467e-05, + "loss": 1.2087, + "step": 6326 + }, + { + "epoch": 1.9204735164668387, + "grad_norm": 0.6091787815093994, + "learning_rate": 6.802166649792447e-05, + "loss": 1.469, + "step": 6327 + }, + { + "epoch": 1.9207770526635302, + "grad_norm": 0.5562729835510254, + "learning_rate": 6.801660423205428e-05, + "loss": 1.6253, + "step": 6328 + }, + { + "epoch": 1.9210805888602216, + "grad_norm": 0.6257612705230713, + "learning_rate": 6.801154196618407e-05, + "loss": 1.3446, + "step": 6329 + }, + { + "epoch": 1.921384125056913, + "grad_norm": 0.6591886878013611, + "learning_rate": 6.800647970031387e-05, + "loss": 1.8737, + "step": 6330 + }, + { + "epoch": 1.9216876612536045, + "grad_norm": 0.6528851985931396, + "learning_rate": 6.800141743444366e-05, + "loss": 1.6804, + "step": 6331 + }, + { + "epoch": 1.921991197450296, + "grad_norm": 0.6890894770622253, + "learning_rate": 6.799635516857346e-05, + "loss": 1.4509, + "step": 6332 + }, + { + "epoch": 1.9222947336469876, + "grad_norm": 0.8640413880348206, + "learning_rate": 6.799129290270325e-05, + "loss": 1.5481, + "step": 6333 + }, + { + "epoch": 1.9225982698436788, + "grad_norm": 0.747075617313385, + "learning_rate": 6.798623063683305e-05, + "loss": 2.102, + "step": 6334 + }, + { + "epoch": 1.9229018060403704, + "grad_norm": 0.5577446222305298, + "learning_rate": 6.798116837096284e-05, + "loss": 1.9817, + "step": 6335 + }, + { + "epoch": 1.9232053422370616, + "grad_norm": 0.4409145712852478, + "learning_rate": 6.797610610509264e-05, + "loss": 1.2139, + "step": 6336 + }, + { + "epoch": 1.9235088784337533, + "grad_norm": 0.5447250604629517, + "learning_rate": 6.797104383922243e-05, + "loss": 1.8243, + "step": 6337 + }, + { + "epoch": 1.9238124146304447, + "grad_norm": 0.5465307831764221, + "learning_rate": 6.796598157335224e-05, + "loss": 1.8265, + "step": 6338 + }, + { + "epoch": 1.9241159508271362, + "grad_norm": 0.5586651563644409, + "learning_rate": 6.796091930748203e-05, + "loss": 1.4372, + "step": 6339 + }, + { + "epoch": 1.9244194870238276, + "grad_norm": 0.44557085633277893, + "learning_rate": 6.795585704161183e-05, + "loss": 1.5344, + "step": 6340 + }, + { + "epoch": 1.924723023220519, + "grad_norm": 0.6220162510871887, + "learning_rate": 6.795079477574162e-05, + "loss": 1.6125, + "step": 6341 + }, + { + "epoch": 1.9250265594172105, + "grad_norm": 0.5200288891792297, + "learning_rate": 6.794573250987142e-05, + "loss": 1.4323, + "step": 6342 + }, + { + "epoch": 1.925330095613902, + "grad_norm": 0.6363093256950378, + "learning_rate": 6.794067024400121e-05, + "loss": 1.2421, + "step": 6343 + }, + { + "epoch": 1.9256336318105935, + "grad_norm": 0.6280449628829956, + "learning_rate": 6.793560797813101e-05, + "loss": 1.4025, + "step": 6344 + }, + { + "epoch": 1.9259371680072848, + "grad_norm": 0.5198882222175598, + "learning_rate": 6.79305457122608e-05, + "loss": 1.9888, + "step": 6345 + }, + { + "epoch": 1.9262407042039764, + "grad_norm": 0.5742619037628174, + "learning_rate": 6.79254834463906e-05, + "loss": 1.719, + "step": 6346 + }, + { + "epoch": 1.9265442404006676, + "grad_norm": 0.6260584592819214, + "learning_rate": 6.79204211805204e-05, + "loss": 1.6638, + "step": 6347 + }, + { + "epoch": 1.9268477765973593, + "grad_norm": 0.506345808506012, + "learning_rate": 6.79153589146502e-05, + "loss": 1.5784, + "step": 6348 + }, + { + "epoch": 1.9271513127940507, + "grad_norm": 0.6416305303573608, + "learning_rate": 6.791029664878e-05, + "loss": 1.7515, + "step": 6349 + }, + { + "epoch": 1.9274548489907422, + "grad_norm": 0.5155960917472839, + "learning_rate": 6.79052343829098e-05, + "loss": 1.9706, + "step": 6350 + }, + { + "epoch": 1.9277583851874336, + "grad_norm": 0.6320833563804626, + "learning_rate": 6.79001721170396e-05, + "loss": 1.687, + "step": 6351 + }, + { + "epoch": 1.928061921384125, + "grad_norm": 0.5683110356330872, + "learning_rate": 6.789510985116939e-05, + "loss": 1.7174, + "step": 6352 + }, + { + "epoch": 1.9283654575808167, + "grad_norm": 0.5914602875709534, + "learning_rate": 6.789004758529919e-05, + "loss": 1.3214, + "step": 6353 + }, + { + "epoch": 1.928668993777508, + "grad_norm": 0.7478408217430115, + "learning_rate": 6.788498531942898e-05, + "loss": 1.6696, + "step": 6354 + }, + { + "epoch": 1.9289725299741995, + "grad_norm": 0.5382063984870911, + "learning_rate": 6.787992305355878e-05, + "loss": 1.8426, + "step": 6355 + }, + { + "epoch": 1.9292760661708908, + "grad_norm": 0.49428850412368774, + "learning_rate": 6.787486078768857e-05, + "loss": 1.6634, + "step": 6356 + }, + { + "epoch": 1.9295796023675824, + "grad_norm": 1.0222886800765991, + "learning_rate": 6.786979852181837e-05, + "loss": 1.7633, + "step": 6357 + }, + { + "epoch": 1.9298831385642736, + "grad_norm": 0.5247024297714233, + "learning_rate": 6.786473625594818e-05, + "loss": 1.2287, + "step": 6358 + }, + { + "epoch": 1.9301866747609653, + "grad_norm": 0.6638110876083374, + "learning_rate": 6.785967399007797e-05, + "loss": 1.6572, + "step": 6359 + }, + { + "epoch": 1.9304902109576567, + "grad_norm": 0.7600938677787781, + "learning_rate": 6.785461172420777e-05, + "loss": 1.7869, + "step": 6360 + }, + { + "epoch": 1.9307937471543482, + "grad_norm": 0.4435073435306549, + "learning_rate": 6.784954945833756e-05, + "loss": 1.3247, + "step": 6361 + }, + { + "epoch": 1.9310972833510396, + "grad_norm": 0.5299533605575562, + "learning_rate": 6.784448719246735e-05, + "loss": 2.0236, + "step": 6362 + }, + { + "epoch": 1.931400819547731, + "grad_norm": 0.6100825071334839, + "learning_rate": 6.783942492659715e-05, + "loss": 1.3755, + "step": 6363 + }, + { + "epoch": 1.9317043557444227, + "grad_norm": 0.5790150761604309, + "learning_rate": 6.783436266072694e-05, + "loss": 1.3591, + "step": 6364 + }, + { + "epoch": 1.9320078919411139, + "grad_norm": 0.5754750967025757, + "learning_rate": 6.782930039485674e-05, + "loss": 2.0173, + "step": 6365 + }, + { + "epoch": 1.9323114281378055, + "grad_norm": 0.6932041049003601, + "learning_rate": 6.782423812898653e-05, + "loss": 1.8836, + "step": 6366 + }, + { + "epoch": 1.9326149643344968, + "grad_norm": 0.5518671274185181, + "learning_rate": 6.781917586311634e-05, + "loss": 1.9421, + "step": 6367 + }, + { + "epoch": 1.9329185005311884, + "grad_norm": 0.5681519508361816, + "learning_rate": 6.781411359724614e-05, + "loss": 1.5273, + "step": 6368 + }, + { + "epoch": 1.9332220367278798, + "grad_norm": 0.47271469235420227, + "learning_rate": 6.780905133137593e-05, + "loss": 1.1639, + "step": 6369 + }, + { + "epoch": 1.9335255729245713, + "grad_norm": 0.5806103348731995, + "learning_rate": 6.780398906550573e-05, + "loss": 1.4392, + "step": 6370 + }, + { + "epoch": 1.9338291091212627, + "grad_norm": 0.5765122771263123, + "learning_rate": 6.779892679963552e-05, + "loss": 1.2515, + "step": 6371 + }, + { + "epoch": 1.9341326453179541, + "grad_norm": 0.42490893602371216, + "learning_rate": 6.779386453376532e-05, + "loss": 1.6461, + "step": 6372 + }, + { + "epoch": 1.9344361815146456, + "grad_norm": 0.5468345284461975, + "learning_rate": 6.778880226789511e-05, + "loss": 1.4913, + "step": 6373 + }, + { + "epoch": 1.934739717711337, + "grad_norm": 0.5540151596069336, + "learning_rate": 6.77837400020249e-05, + "loss": 1.6999, + "step": 6374 + }, + { + "epoch": 1.9350432539080287, + "grad_norm": 0.5978628396987915, + "learning_rate": 6.77786777361547e-05, + "loss": 1.5276, + "step": 6375 + }, + { + "epoch": 1.9353467901047199, + "grad_norm": 0.6584348678588867, + "learning_rate": 6.77736154702845e-05, + "loss": 1.2151, + "step": 6376 + }, + { + "epoch": 1.9356503263014115, + "grad_norm": 0.5471230149269104, + "learning_rate": 6.77685532044143e-05, + "loss": 1.4346, + "step": 6377 + }, + { + "epoch": 1.9359538624981028, + "grad_norm": 0.5572274327278137, + "learning_rate": 6.77634909385441e-05, + "loss": 1.3116, + "step": 6378 + }, + { + "epoch": 1.9362573986947944, + "grad_norm": 0.5694922804832458, + "learning_rate": 6.77584286726739e-05, + "loss": 2.0379, + "step": 6379 + }, + { + "epoch": 1.9365609348914858, + "grad_norm": 0.4971182346343994, + "learning_rate": 6.775336640680369e-05, + "loss": 1.7887, + "step": 6380 + }, + { + "epoch": 1.9368644710881773, + "grad_norm": 0.4632358253002167, + "learning_rate": 6.774830414093348e-05, + "loss": 2.0526, + "step": 6381 + }, + { + "epoch": 1.9371680072848687, + "grad_norm": 0.6226524114608765, + "learning_rate": 6.774324187506328e-05, + "loss": 1.2819, + "step": 6382 + }, + { + "epoch": 1.9374715434815601, + "grad_norm": 0.46665677428245544, + "learning_rate": 6.773817960919307e-05, + "loss": 1.7702, + "step": 6383 + }, + { + "epoch": 1.9377750796782516, + "grad_norm": 0.5216962099075317, + "learning_rate": 6.773311734332287e-05, + "loss": 1.635, + "step": 6384 + }, + { + "epoch": 1.938078615874943, + "grad_norm": 0.5371339321136475, + "learning_rate": 6.772805507745266e-05, + "loss": 1.804, + "step": 6385 + }, + { + "epoch": 1.9383821520716347, + "grad_norm": 0.5909997820854187, + "learning_rate": 6.772299281158247e-05, + "loss": 1.5609, + "step": 6386 + }, + { + "epoch": 1.9386856882683259, + "grad_norm": 0.5082078576087952, + "learning_rate": 6.771793054571227e-05, + "loss": 1.9097, + "step": 6387 + }, + { + "epoch": 1.9389892244650175, + "grad_norm": 0.5267134308815002, + "learning_rate": 6.771286827984206e-05, + "loss": 1.7824, + "step": 6388 + }, + { + "epoch": 1.9392927606617087, + "grad_norm": 0.5663163065910339, + "learning_rate": 6.770780601397185e-05, + "loss": 1.5645, + "step": 6389 + }, + { + "epoch": 1.9395962968584004, + "grad_norm": 0.5970969796180725, + "learning_rate": 6.770274374810165e-05, + "loss": 1.8656, + "step": 6390 + }, + { + "epoch": 1.9398998330550918, + "grad_norm": 0.4860338270664215, + "learning_rate": 6.769768148223144e-05, + "loss": 1.4045, + "step": 6391 + }, + { + "epoch": 1.9402033692517833, + "grad_norm": 0.5907909274101257, + "learning_rate": 6.769261921636124e-05, + "loss": 1.6731, + "step": 6392 + }, + { + "epoch": 1.9405069054484747, + "grad_norm": 1.025832176208496, + "learning_rate": 6.768755695049103e-05, + "loss": 0.6762, + "step": 6393 + }, + { + "epoch": 1.9408104416451661, + "grad_norm": 0.5831971168518066, + "learning_rate": 6.768249468462083e-05, + "loss": 1.8978, + "step": 6394 + }, + { + "epoch": 1.9411139778418578, + "grad_norm": 0.6331748366355896, + "learning_rate": 6.767743241875064e-05, + "loss": 1.8449, + "step": 6395 + }, + { + "epoch": 1.941417514038549, + "grad_norm": 0.5442969799041748, + "learning_rate": 6.767237015288043e-05, + "loss": 1.5979, + "step": 6396 + }, + { + "epoch": 1.9417210502352407, + "grad_norm": 0.5305371880531311, + "learning_rate": 6.766730788701024e-05, + "loss": 1.314, + "step": 6397 + }, + { + "epoch": 1.9420245864319319, + "grad_norm": 0.5143059492111206, + "learning_rate": 6.766224562114004e-05, + "loss": 1.8672, + "step": 6398 + }, + { + "epoch": 1.9423281226286235, + "grad_norm": 0.538637638092041, + "learning_rate": 6.765718335526983e-05, + "loss": 1.6072, + "step": 6399 + }, + { + "epoch": 1.942631658825315, + "grad_norm": 0.5820212364196777, + "learning_rate": 6.765212108939962e-05, + "loss": 1.7255, + "step": 6400 + }, + { + "epoch": 1.9429351950220064, + "grad_norm": 0.5563800930976868, + "learning_rate": 6.764705882352942e-05, + "loss": 1.8238, + "step": 6401 + }, + { + "epoch": 1.9432387312186978, + "grad_norm": 0.5390816330909729, + "learning_rate": 6.764199655765921e-05, + "loss": 1.6314, + "step": 6402 + }, + { + "epoch": 1.9435422674153893, + "grad_norm": 0.6193544268608093, + "learning_rate": 6.763693429178901e-05, + "loss": 1.4942, + "step": 6403 + }, + { + "epoch": 1.9438458036120807, + "grad_norm": 0.6250959634780884, + "learning_rate": 6.76318720259188e-05, + "loss": 1.6116, + "step": 6404 + }, + { + "epoch": 1.9441493398087721, + "grad_norm": 0.5631844997406006, + "learning_rate": 6.76268097600486e-05, + "loss": 1.6372, + "step": 6405 + }, + { + "epoch": 1.9444528760054638, + "grad_norm": 0.5537664890289307, + "learning_rate": 6.762174749417841e-05, + "loss": 1.7545, + "step": 6406 + }, + { + "epoch": 1.944756412202155, + "grad_norm": 0.537027895450592, + "learning_rate": 6.76166852283082e-05, + "loss": 1.3335, + "step": 6407 + }, + { + "epoch": 1.9450599483988467, + "grad_norm": 0.5764704346656799, + "learning_rate": 6.7611622962438e-05, + "loss": 1.6726, + "step": 6408 + }, + { + "epoch": 1.9453634845955379, + "grad_norm": 0.6139593124389648, + "learning_rate": 6.760656069656779e-05, + "loss": 1.6113, + "step": 6409 + }, + { + "epoch": 1.9456670207922295, + "grad_norm": 0.708470344543457, + "learning_rate": 6.760149843069759e-05, + "loss": 1.4301, + "step": 6410 + }, + { + "epoch": 1.945970556988921, + "grad_norm": 0.6561748385429382, + "learning_rate": 6.759643616482738e-05, + "loss": 1.8532, + "step": 6411 + }, + { + "epoch": 1.9462740931856124, + "grad_norm": 0.5118820667266846, + "learning_rate": 6.759137389895718e-05, + "loss": 1.7623, + "step": 6412 + }, + { + "epoch": 1.9465776293823038, + "grad_norm": 0.5191006064414978, + "learning_rate": 6.758631163308697e-05, + "loss": 1.3966, + "step": 6413 + }, + { + "epoch": 1.9468811655789953, + "grad_norm": 0.49863511323928833, + "learning_rate": 6.758124936721677e-05, + "loss": 1.7264, + "step": 6414 + }, + { + "epoch": 1.9471847017756867, + "grad_norm": 0.4555209279060364, + "learning_rate": 6.757618710134656e-05, + "loss": 1.7601, + "step": 6415 + }, + { + "epoch": 1.9474882379723781, + "grad_norm": 0.5262234210968018, + "learning_rate": 6.757112483547637e-05, + "loss": 1.7674, + "step": 6416 + }, + { + "epoch": 1.9477917741690698, + "grad_norm": 0.6079599857330322, + "learning_rate": 6.756606256960616e-05, + "loss": 1.4726, + "step": 6417 + }, + { + "epoch": 1.948095310365761, + "grad_norm": 0.6055933237075806, + "learning_rate": 6.756100030373596e-05, + "loss": 1.6495, + "step": 6418 + }, + { + "epoch": 1.9483988465624527, + "grad_norm": 0.5409154295921326, + "learning_rate": 6.755593803786575e-05, + "loss": 1.8415, + "step": 6419 + }, + { + "epoch": 1.9487023827591439, + "grad_norm": 0.5464493036270142, + "learning_rate": 6.755087577199555e-05, + "loss": 1.0245, + "step": 6420 + }, + { + "epoch": 1.9490059189558355, + "grad_norm": 0.8876932859420776, + "learning_rate": 6.754581350612534e-05, + "loss": 1.4808, + "step": 6421 + }, + { + "epoch": 1.949309455152527, + "grad_norm": 0.5546230673789978, + "learning_rate": 6.754075124025514e-05, + "loss": 1.6274, + "step": 6422 + }, + { + "epoch": 1.9496129913492184, + "grad_norm": 0.5752897262573242, + "learning_rate": 6.753568897438493e-05, + "loss": 1.7384, + "step": 6423 + }, + { + "epoch": 1.9499165275459098, + "grad_norm": 0.6077266931533813, + "learning_rate": 6.753062670851473e-05, + "loss": 1.5749, + "step": 6424 + }, + { + "epoch": 1.9502200637426013, + "grad_norm": 0.5324370265007019, + "learning_rate": 6.752556444264454e-05, + "loss": 1.6556, + "step": 6425 + }, + { + "epoch": 1.950523599939293, + "grad_norm": 0.5463990569114685, + "learning_rate": 6.752050217677433e-05, + "loss": 1.6159, + "step": 6426 + }, + { + "epoch": 1.9508271361359841, + "grad_norm": 0.5445590615272522, + "learning_rate": 6.751543991090412e-05, + "loss": 1.4407, + "step": 6427 + }, + { + "epoch": 1.9511306723326758, + "grad_norm": 0.5993614792823792, + "learning_rate": 6.751037764503392e-05, + "loss": 1.667, + "step": 6428 + }, + { + "epoch": 1.951434208529367, + "grad_norm": 0.5832435488700867, + "learning_rate": 6.750531537916371e-05, + "loss": 1.6842, + "step": 6429 + }, + { + "epoch": 1.9517377447260587, + "grad_norm": 0.6399338245391846, + "learning_rate": 6.750025311329351e-05, + "loss": 1.9275, + "step": 6430 + }, + { + "epoch": 1.95204128092275, + "grad_norm": 0.46948257088661194, + "learning_rate": 6.74951908474233e-05, + "loss": 1.4798, + "step": 6431 + }, + { + "epoch": 1.9523448171194415, + "grad_norm": 0.5000807642936707, + "learning_rate": 6.74901285815531e-05, + "loss": 0.8763, + "step": 6432 + }, + { + "epoch": 1.952648353316133, + "grad_norm": 0.7182363867759705, + "learning_rate": 6.74850663156829e-05, + "loss": 1.4422, + "step": 6433 + }, + { + "epoch": 1.9529518895128244, + "grad_norm": 0.7131654620170593, + "learning_rate": 6.74800040498127e-05, + "loss": 1.2321, + "step": 6434 + }, + { + "epoch": 1.9532554257095158, + "grad_norm": 0.7544083595275879, + "learning_rate": 6.74749417839425e-05, + "loss": 1.5929, + "step": 6435 + }, + { + "epoch": 1.9535589619062073, + "grad_norm": 0.5653949975967407, + "learning_rate": 6.746987951807229e-05, + "loss": 1.9116, + "step": 6436 + }, + { + "epoch": 1.953862498102899, + "grad_norm": 0.49180904030799866, + "learning_rate": 6.746481725220209e-05, + "loss": 2.1056, + "step": 6437 + }, + { + "epoch": 1.9541660342995901, + "grad_norm": 0.5516203045845032, + "learning_rate": 6.745975498633188e-05, + "loss": 1.6536, + "step": 6438 + }, + { + "epoch": 1.9544695704962818, + "grad_norm": 0.535835325717926, + "learning_rate": 6.745469272046169e-05, + "loss": 1.7121, + "step": 6439 + }, + { + "epoch": 1.954773106692973, + "grad_norm": 0.5963249802589417, + "learning_rate": 6.744963045459148e-05, + "loss": 1.4976, + "step": 6440 + }, + { + "epoch": 1.9550766428896647, + "grad_norm": 0.648929238319397, + "learning_rate": 6.744456818872128e-05, + "loss": 1.5205, + "step": 6441 + }, + { + "epoch": 1.955380179086356, + "grad_norm": 0.5600722432136536, + "learning_rate": 6.743950592285107e-05, + "loss": 1.6621, + "step": 6442 + }, + { + "epoch": 1.9556837152830475, + "grad_norm": 0.48909708857536316, + "learning_rate": 6.743444365698087e-05, + "loss": 1.7789, + "step": 6443 + }, + { + "epoch": 1.955987251479739, + "grad_norm": 0.5888108015060425, + "learning_rate": 6.742938139111066e-05, + "loss": 1.9655, + "step": 6444 + }, + { + "epoch": 1.9562907876764304, + "grad_norm": 0.5093645453453064, + "learning_rate": 6.742431912524047e-05, + "loss": 1.5964, + "step": 6445 + }, + { + "epoch": 1.9565943238731218, + "grad_norm": 0.4677153527736664, + "learning_rate": 6.741925685937027e-05, + "loss": 2.0216, + "step": 6446 + }, + { + "epoch": 1.9568978600698133, + "grad_norm": 0.6702057719230652, + "learning_rate": 6.741419459350006e-05, + "loss": 1.4916, + "step": 6447 + }, + { + "epoch": 1.957201396266505, + "grad_norm": 0.5236727595329285, + "learning_rate": 6.740913232762986e-05, + "loss": 1.6388, + "step": 6448 + }, + { + "epoch": 1.9575049324631961, + "grad_norm": 0.43825381994247437, + "learning_rate": 6.740407006175965e-05, + "loss": 1.6884, + "step": 6449 + }, + { + "epoch": 1.9578084686598878, + "grad_norm": 0.4706141948699951, + "learning_rate": 6.739900779588945e-05, + "loss": 1.2306, + "step": 6450 + }, + { + "epoch": 1.958112004856579, + "grad_norm": 0.5973210334777832, + "learning_rate": 6.739394553001924e-05, + "loss": 1.6573, + "step": 6451 + }, + { + "epoch": 1.9584155410532706, + "grad_norm": 0.6160237193107605, + "learning_rate": 6.738888326414904e-05, + "loss": 1.6356, + "step": 6452 + }, + { + "epoch": 1.958719077249962, + "grad_norm": 0.6227190494537354, + "learning_rate": 6.738382099827883e-05, + "loss": 1.4816, + "step": 6453 + }, + { + "epoch": 1.9590226134466535, + "grad_norm": 0.5531430840492249, + "learning_rate": 6.737875873240862e-05, + "loss": 1.7044, + "step": 6454 + }, + { + "epoch": 1.959326149643345, + "grad_norm": 0.4098502993583679, + "learning_rate": 6.737369646653843e-05, + "loss": 1.8336, + "step": 6455 + }, + { + "epoch": 1.9596296858400364, + "grad_norm": 0.6158850193023682, + "learning_rate": 6.736863420066823e-05, + "loss": 1.8795, + "step": 6456 + }, + { + "epoch": 1.959933222036728, + "grad_norm": 0.5197069644927979, + "learning_rate": 6.736357193479802e-05, + "loss": 1.515, + "step": 6457 + }, + { + "epoch": 1.9602367582334193, + "grad_norm": 0.7297940254211426, + "learning_rate": 6.735850966892782e-05, + "loss": 1.1881, + "step": 6458 + }, + { + "epoch": 1.960540294430111, + "grad_norm": 0.5615236163139343, + "learning_rate": 6.735344740305761e-05, + "loss": 1.4264, + "step": 6459 + }, + { + "epoch": 1.9608438306268021, + "grad_norm": 0.5689505934715271, + "learning_rate": 6.734838513718741e-05, + "loss": 1.8072, + "step": 6460 + }, + { + "epoch": 1.9611473668234938, + "grad_norm": 0.5976860523223877, + "learning_rate": 6.73433228713172e-05, + "loss": 1.6385, + "step": 6461 + }, + { + "epoch": 1.9614509030201852, + "grad_norm": 0.887874186038971, + "learning_rate": 6.7338260605447e-05, + "loss": 1.5963, + "step": 6462 + }, + { + "epoch": 1.9617544392168766, + "grad_norm": 0.5702905058860779, + "learning_rate": 6.733319833957679e-05, + "loss": 1.7066, + "step": 6463 + }, + { + "epoch": 1.962057975413568, + "grad_norm": 0.5714231729507446, + "learning_rate": 6.73281360737066e-05, + "loss": 1.6111, + "step": 6464 + }, + { + "epoch": 1.9623615116102595, + "grad_norm": 0.6091324687004089, + "learning_rate": 6.73230738078364e-05, + "loss": 1.6581, + "step": 6465 + }, + { + "epoch": 1.962665047806951, + "grad_norm": 0.5078444480895996, + "learning_rate": 6.731801154196619e-05, + "loss": 1.5205, + "step": 6466 + }, + { + "epoch": 1.9629685840036424, + "grad_norm": 0.5809574723243713, + "learning_rate": 6.731294927609598e-05, + "loss": 1.1441, + "step": 6467 + }, + { + "epoch": 1.963272120200334, + "grad_norm": 0.5350870490074158, + "learning_rate": 6.730788701022578e-05, + "loss": 1.5295, + "step": 6468 + }, + { + "epoch": 1.9635756563970252, + "grad_norm": 0.5390435457229614, + "learning_rate": 6.730282474435557e-05, + "loss": 1.764, + "step": 6469 + }, + { + "epoch": 1.963879192593717, + "grad_norm": 0.5699825882911682, + "learning_rate": 6.729776247848537e-05, + "loss": 1.7769, + "step": 6470 + }, + { + "epoch": 1.9641827287904081, + "grad_norm": 0.5871105194091797, + "learning_rate": 6.729270021261516e-05, + "loss": 1.7677, + "step": 6471 + }, + { + "epoch": 1.9644862649870998, + "grad_norm": 0.404573917388916, + "learning_rate": 6.728763794674496e-05, + "loss": 1.4699, + "step": 6472 + }, + { + "epoch": 1.9647898011837912, + "grad_norm": 0.6037812232971191, + "learning_rate": 6.728257568087477e-05, + "loss": 1.8687, + "step": 6473 + }, + { + "epoch": 1.9650933373804826, + "grad_norm": 0.5443028211593628, + "learning_rate": 6.727751341500456e-05, + "loss": 1.6255, + "step": 6474 + }, + { + "epoch": 1.965396873577174, + "grad_norm": 0.6080754399299622, + "learning_rate": 6.727245114913436e-05, + "loss": 1.898, + "step": 6475 + }, + { + "epoch": 1.9657004097738655, + "grad_norm": 0.597493588924408, + "learning_rate": 6.726738888326415e-05, + "loss": 1.8199, + "step": 6476 + }, + { + "epoch": 1.966003945970557, + "grad_norm": 1.2117207050323486, + "learning_rate": 6.726232661739395e-05, + "loss": 1.4137, + "step": 6477 + }, + { + "epoch": 1.9663074821672484, + "grad_norm": 0.5255807042121887, + "learning_rate": 6.725726435152374e-05, + "loss": 1.5809, + "step": 6478 + }, + { + "epoch": 1.96661101836394, + "grad_norm": 0.5825793147087097, + "learning_rate": 6.725220208565354e-05, + "loss": 1.7478, + "step": 6479 + }, + { + "epoch": 1.9669145545606312, + "grad_norm": 0.5194748044013977, + "learning_rate": 6.724713981978333e-05, + "loss": 1.7185, + "step": 6480 + }, + { + "epoch": 1.967218090757323, + "grad_norm": 0.6001968383789062, + "learning_rate": 6.724207755391312e-05, + "loss": 1.8425, + "step": 6481 + }, + { + "epoch": 1.9675216269540141, + "grad_norm": 0.4742978513240814, + "learning_rate": 6.723701528804292e-05, + "loss": 1.3781, + "step": 6482 + }, + { + "epoch": 1.9678251631507058, + "grad_norm": 0.5860036611557007, + "learning_rate": 6.723195302217273e-05, + "loss": 1.6749, + "step": 6483 + }, + { + "epoch": 1.9681286993473972, + "grad_norm": 0.5993077754974365, + "learning_rate": 6.722689075630254e-05, + "loss": 1.4344, + "step": 6484 + }, + { + "epoch": 1.9684322355440886, + "grad_norm": 0.6019352674484253, + "learning_rate": 6.722182849043233e-05, + "loss": 1.7294, + "step": 6485 + }, + { + "epoch": 1.96873577174078, + "grad_norm": 0.707388162612915, + "learning_rate": 6.721676622456213e-05, + "loss": 1.8304, + "step": 6486 + }, + { + "epoch": 1.9690393079374715, + "grad_norm": 0.5254133343696594, + "learning_rate": 6.721170395869192e-05, + "loss": 1.919, + "step": 6487 + }, + { + "epoch": 1.9693428441341632, + "grad_norm": 0.558387815952301, + "learning_rate": 6.720664169282172e-05, + "loss": 1.5288, + "step": 6488 + }, + { + "epoch": 1.9696463803308544, + "grad_norm": 0.5417835712432861, + "learning_rate": 6.720157942695151e-05, + "loss": 1.6305, + "step": 6489 + }, + { + "epoch": 1.969949916527546, + "grad_norm": 0.49199047684669495, + "learning_rate": 6.71965171610813e-05, + "loss": 1.0437, + "step": 6490 + }, + { + "epoch": 1.9702534527242372, + "grad_norm": 0.5585491061210632, + "learning_rate": 6.71914548952111e-05, + "loss": 1.6227, + "step": 6491 + }, + { + "epoch": 1.970556988920929, + "grad_norm": 0.5286018252372742, + "learning_rate": 6.71863926293409e-05, + "loss": 1.765, + "step": 6492 + }, + { + "epoch": 1.97086052511762, + "grad_norm": 0.499729722738266, + "learning_rate": 6.718133036347069e-05, + "loss": 1.9407, + "step": 6493 + }, + { + "epoch": 1.9711640613143118, + "grad_norm": 0.6999737620353699, + "learning_rate": 6.71762680976005e-05, + "loss": 1.7973, + "step": 6494 + }, + { + "epoch": 1.9714675975110032, + "grad_norm": 0.5820497274398804, + "learning_rate": 6.717120583173029e-05, + "loss": 1.7509, + "step": 6495 + }, + { + "epoch": 1.9717711337076946, + "grad_norm": 0.6684516072273254, + "learning_rate": 6.716614356586009e-05, + "loss": 1.093, + "step": 6496 + }, + { + "epoch": 1.972074669904386, + "grad_norm": 0.5111326575279236, + "learning_rate": 6.716108129998988e-05, + "loss": 1.7832, + "step": 6497 + }, + { + "epoch": 1.9723782061010775, + "grad_norm": 0.5432667136192322, + "learning_rate": 6.715601903411968e-05, + "loss": 1.1834, + "step": 6498 + }, + { + "epoch": 1.9726817422977692, + "grad_norm": 0.4429357945919037, + "learning_rate": 6.715095676824947e-05, + "loss": 0.8361, + "step": 6499 + }, + { + "epoch": 1.9729852784944604, + "grad_norm": 0.6037774085998535, + "learning_rate": 6.714589450237927e-05, + "loss": 1.8509, + "step": 6500 + }, + { + "epoch": 1.973288814691152, + "grad_norm": 0.5203253030776978, + "learning_rate": 6.714083223650906e-05, + "loss": 1.4746, + "step": 6501 + }, + { + "epoch": 1.9735923508878432, + "grad_norm": 0.5598331093788147, + "learning_rate": 6.713576997063886e-05, + "loss": 1.904, + "step": 6502 + }, + { + "epoch": 1.973895887084535, + "grad_norm": 0.5714544653892517, + "learning_rate": 6.713070770476866e-05, + "loss": 1.3138, + "step": 6503 + }, + { + "epoch": 1.9741994232812263, + "grad_norm": 0.5742242932319641, + "learning_rate": 6.712564543889846e-05, + "loss": 1.5685, + "step": 6504 + }, + { + "epoch": 1.9745029594779178, + "grad_norm": 0.6703736782073975, + "learning_rate": 6.712058317302825e-05, + "loss": 1.4434, + "step": 6505 + }, + { + "epoch": 1.9748064956746092, + "grad_norm": 0.6143460869789124, + "learning_rate": 6.711552090715805e-05, + "loss": 1.3583, + "step": 6506 + }, + { + "epoch": 1.9751100318713006, + "grad_norm": 0.5895220637321472, + "learning_rate": 6.711045864128784e-05, + "loss": 1.8181, + "step": 6507 + }, + { + "epoch": 1.975413568067992, + "grad_norm": 0.5533620119094849, + "learning_rate": 6.710539637541764e-05, + "loss": 1.4437, + "step": 6508 + }, + { + "epoch": 1.9757171042646835, + "grad_norm": 0.45495548844337463, + "learning_rate": 6.710033410954743e-05, + "loss": 1.7565, + "step": 6509 + }, + { + "epoch": 1.9760206404613752, + "grad_norm": 0.5480602383613586, + "learning_rate": 6.709527184367723e-05, + "loss": 1.8036, + "step": 6510 + }, + { + "epoch": 1.9763241766580664, + "grad_norm": 0.6139658689498901, + "learning_rate": 6.709020957780702e-05, + "loss": 1.7937, + "step": 6511 + }, + { + "epoch": 1.976627712854758, + "grad_norm": 0.4904802739620209, + "learning_rate": 6.708514731193683e-05, + "loss": 1.9573, + "step": 6512 + }, + { + "epoch": 1.9769312490514492, + "grad_norm": 0.4706535339355469, + "learning_rate": 6.708008504606663e-05, + "loss": 1.6996, + "step": 6513 + }, + { + "epoch": 1.977234785248141, + "grad_norm": 0.5434771180152893, + "learning_rate": 6.707502278019642e-05, + "loss": 1.4273, + "step": 6514 + }, + { + "epoch": 1.9775383214448323, + "grad_norm": 0.5178016424179077, + "learning_rate": 6.706996051432622e-05, + "loss": 1.277, + "step": 6515 + }, + { + "epoch": 1.9778418576415238, + "grad_norm": 0.47218969464302063, + "learning_rate": 6.706489824845601e-05, + "loss": 1.122, + "step": 6516 + }, + { + "epoch": 1.9781453938382152, + "grad_norm": 0.5253738760948181, + "learning_rate": 6.70598359825858e-05, + "loss": 1.9643, + "step": 6517 + }, + { + "epoch": 1.9784489300349066, + "grad_norm": 0.5150578618049622, + "learning_rate": 6.70547737167156e-05, + "loss": 1.5839, + "step": 6518 + }, + { + "epoch": 1.9787524662315983, + "grad_norm": 0.5388327240943909, + "learning_rate": 6.70497114508454e-05, + "loss": 1.8581, + "step": 6519 + }, + { + "epoch": 1.9790560024282895, + "grad_norm": 0.518647313117981, + "learning_rate": 6.704464918497519e-05, + "loss": 1.9299, + "step": 6520 + }, + { + "epoch": 1.9793595386249812, + "grad_norm": 0.6723092198371887, + "learning_rate": 6.703958691910498e-05, + "loss": 1.8578, + "step": 6521 + }, + { + "epoch": 1.9796630748216724, + "grad_norm": 0.5758662819862366, + "learning_rate": 6.703452465323479e-05, + "loss": 1.6435, + "step": 6522 + }, + { + "epoch": 1.979966611018364, + "grad_norm": 0.6570977568626404, + "learning_rate": 6.702946238736459e-05, + "loss": 1.9321, + "step": 6523 + }, + { + "epoch": 1.9802701472150552, + "grad_norm": 0.5400984883308411, + "learning_rate": 6.702440012149438e-05, + "loss": 1.4454, + "step": 6524 + }, + { + "epoch": 1.9805736834117469, + "grad_norm": 0.48057571053504944, + "learning_rate": 6.701933785562418e-05, + "loss": 1.6747, + "step": 6525 + }, + { + "epoch": 1.9808772196084383, + "grad_norm": 0.6174799799919128, + "learning_rate": 6.701427558975397e-05, + "loss": 1.4885, + "step": 6526 + }, + { + "epoch": 1.9811807558051298, + "grad_norm": 0.6381711363792419, + "learning_rate": 6.700921332388377e-05, + "loss": 1.5634, + "step": 6527 + }, + { + "epoch": 1.9814842920018212, + "grad_norm": 0.5980601906776428, + "learning_rate": 6.700415105801358e-05, + "loss": 1.779, + "step": 6528 + }, + { + "epoch": 1.9817878281985126, + "grad_norm": 0.6087924242019653, + "learning_rate": 6.699908879214337e-05, + "loss": 1.3662, + "step": 6529 + }, + { + "epoch": 1.9820913643952043, + "grad_norm": 0.7204270958900452, + "learning_rate": 6.699402652627316e-05, + "loss": 1.7392, + "step": 6530 + }, + { + "epoch": 1.9823949005918955, + "grad_norm": 0.6271967887878418, + "learning_rate": 6.698896426040296e-05, + "loss": 1.9596, + "step": 6531 + }, + { + "epoch": 1.9826984367885871, + "grad_norm": 0.5030335783958435, + "learning_rate": 6.698390199453275e-05, + "loss": 1.6411, + "step": 6532 + }, + { + "epoch": 1.9830019729852784, + "grad_norm": 0.5594578385353088, + "learning_rate": 6.697883972866256e-05, + "loss": 1.2329, + "step": 6533 + }, + { + "epoch": 1.98330550918197, + "grad_norm": 0.6117578744888306, + "learning_rate": 6.697377746279236e-05, + "loss": 1.7061, + "step": 6534 + }, + { + "epoch": 1.9836090453786615, + "grad_norm": 0.44561445713043213, + "learning_rate": 6.696871519692215e-05, + "loss": 1.8048, + "step": 6535 + }, + { + "epoch": 1.9839125815753529, + "grad_norm": 0.49325069785118103, + "learning_rate": 6.696365293105195e-05, + "loss": 1.8344, + "step": 6536 + }, + { + "epoch": 1.9842161177720443, + "grad_norm": 0.42023026943206787, + "learning_rate": 6.695859066518174e-05, + "loss": 1.7417, + "step": 6537 + }, + { + "epoch": 1.9845196539687358, + "grad_norm": 0.5083653926849365, + "learning_rate": 6.695352839931154e-05, + "loss": 1.8918, + "step": 6538 + }, + { + "epoch": 1.9848231901654272, + "grad_norm": 0.5898207426071167, + "learning_rate": 6.694846613344133e-05, + "loss": 1.5339, + "step": 6539 + }, + { + "epoch": 1.9851267263621186, + "grad_norm": 0.622306764125824, + "learning_rate": 6.694340386757113e-05, + "loss": 1.5009, + "step": 6540 + }, + { + "epoch": 1.9854302625588103, + "grad_norm": 0.5618451237678528, + "learning_rate": 6.693834160170092e-05, + "loss": 1.235, + "step": 6541 + }, + { + "epoch": 1.9857337987555015, + "grad_norm": 0.5828057527542114, + "learning_rate": 6.693327933583073e-05, + "loss": 1.7947, + "step": 6542 + }, + { + "epoch": 1.9860373349521931, + "grad_norm": 0.4943867027759552, + "learning_rate": 6.692821706996052e-05, + "loss": 1.2251, + "step": 6543 + }, + { + "epoch": 1.9863408711488844, + "grad_norm": 0.5512354969978333, + "learning_rate": 6.692315480409032e-05, + "loss": 1.6636, + "step": 6544 + }, + { + "epoch": 1.986644407345576, + "grad_norm": 0.579082190990448, + "learning_rate": 6.691809253822011e-05, + "loss": 1.8635, + "step": 6545 + }, + { + "epoch": 1.9869479435422674, + "grad_norm": 0.4855844974517822, + "learning_rate": 6.691303027234991e-05, + "loss": 1.0762, + "step": 6546 + }, + { + "epoch": 1.9872514797389589, + "grad_norm": 0.4621904194355011, + "learning_rate": 6.69079680064797e-05, + "loss": 1.5484, + "step": 6547 + }, + { + "epoch": 1.9875550159356503, + "grad_norm": 0.5971441268920898, + "learning_rate": 6.69029057406095e-05, + "loss": 1.1564, + "step": 6548 + }, + { + "epoch": 1.9878585521323417, + "grad_norm": 0.47844675183296204, + "learning_rate": 6.689784347473929e-05, + "loss": 1.773, + "step": 6549 + }, + { + "epoch": 1.9881620883290332, + "grad_norm": 0.47111570835113525, + "learning_rate": 6.689278120886909e-05, + "loss": 1.2417, + "step": 6550 + }, + { + "epoch": 1.9884656245257246, + "grad_norm": 0.6051010489463806, + "learning_rate": 6.68877189429989e-05, + "loss": 1.8236, + "step": 6551 + }, + { + "epoch": 1.9887691607224163, + "grad_norm": 0.7357580065727234, + "learning_rate": 6.688265667712869e-05, + "loss": 1.6381, + "step": 6552 + }, + { + "epoch": 1.9890726969191075, + "grad_norm": 0.866306483745575, + "learning_rate": 6.687759441125849e-05, + "loss": 1.2849, + "step": 6553 + }, + { + "epoch": 1.9893762331157991, + "grad_norm": 0.5960594415664673, + "learning_rate": 6.687253214538828e-05, + "loss": 1.3701, + "step": 6554 + }, + { + "epoch": 1.9896797693124904, + "grad_norm": 0.5979890823364258, + "learning_rate": 6.686746987951808e-05, + "loss": 2.0261, + "step": 6555 + }, + { + "epoch": 1.989983305509182, + "grad_norm": 0.47698384523391724, + "learning_rate": 6.686240761364787e-05, + "loss": 1.24, + "step": 6556 + }, + { + "epoch": 1.9902868417058734, + "grad_norm": 0.6178485751152039, + "learning_rate": 6.685734534777766e-05, + "loss": 1.8583, + "step": 6557 + }, + { + "epoch": 1.9905903779025649, + "grad_norm": 0.5533079504966736, + "learning_rate": 6.685228308190746e-05, + "loss": 0.9868, + "step": 6558 + }, + { + "epoch": 1.9908939140992563, + "grad_norm": 0.5186291337013245, + "learning_rate": 6.684722081603725e-05, + "loss": 1.8188, + "step": 6559 + }, + { + "epoch": 1.9911974502959477, + "grad_norm": 0.5277303457260132, + "learning_rate": 6.684215855016705e-05, + "loss": 1.4531, + "step": 6560 + }, + { + "epoch": 1.9915009864926394, + "grad_norm": 1.0453486442565918, + "learning_rate": 6.683709628429686e-05, + "loss": 1.4636, + "step": 6561 + }, + { + "epoch": 1.9918045226893306, + "grad_norm": 0.5569626688957214, + "learning_rate": 6.683203401842665e-05, + "loss": 1.9508, + "step": 6562 + }, + { + "epoch": 1.9921080588860223, + "grad_norm": 0.4960695207118988, + "learning_rate": 6.682697175255645e-05, + "loss": 1.5171, + "step": 6563 + }, + { + "epoch": 1.9924115950827135, + "grad_norm": 0.6133225560188293, + "learning_rate": 6.682190948668624e-05, + "loss": 1.5388, + "step": 6564 + }, + { + "epoch": 1.9927151312794051, + "grad_norm": 0.5678309202194214, + "learning_rate": 6.681684722081604e-05, + "loss": 1.7054, + "step": 6565 + }, + { + "epoch": 1.9930186674760966, + "grad_norm": 0.5518128275871277, + "learning_rate": 6.681178495494583e-05, + "loss": 1.6772, + "step": 6566 + }, + { + "epoch": 1.993322203672788, + "grad_norm": 0.5464813709259033, + "learning_rate": 6.680672268907563e-05, + "loss": 1.9509, + "step": 6567 + }, + { + "epoch": 1.9936257398694794, + "grad_norm": 0.4115614891052246, + "learning_rate": 6.680166042320542e-05, + "loss": 1.3763, + "step": 6568 + }, + { + "epoch": 1.9939292760661709, + "grad_norm": 0.5534042716026306, + "learning_rate": 6.679659815733522e-05, + "loss": 1.788, + "step": 6569 + }, + { + "epoch": 1.9942328122628623, + "grad_norm": 0.5049768686294556, + "learning_rate": 6.679153589146502e-05, + "loss": 1.435, + "step": 6570 + }, + { + "epoch": 1.9945363484595537, + "grad_norm": 0.5459719896316528, + "learning_rate": 6.678647362559482e-05, + "loss": 1.3551, + "step": 6571 + }, + { + "epoch": 1.9948398846562454, + "grad_norm": 0.9183335900306702, + "learning_rate": 6.678141135972461e-05, + "loss": 1.2186, + "step": 6572 + }, + { + "epoch": 1.9951434208529366, + "grad_norm": 0.5222302079200745, + "learning_rate": 6.677634909385442e-05, + "loss": 1.8529, + "step": 6573 + }, + { + "epoch": 1.9954469570496283, + "grad_norm": 0.5741526484489441, + "learning_rate": 6.677128682798422e-05, + "loss": 1.5536, + "step": 6574 + }, + { + "epoch": 1.9957504932463195, + "grad_norm": 0.5656235814094543, + "learning_rate": 6.676622456211401e-05, + "loss": 1.5544, + "step": 6575 + }, + { + "epoch": 1.9960540294430111, + "grad_norm": 0.4552244246006012, + "learning_rate": 6.67611622962438e-05, + "loss": 2.337, + "step": 6576 + }, + { + "epoch": 1.9963575656397026, + "grad_norm": 0.5700350999832153, + "learning_rate": 6.67561000303736e-05, + "loss": 1.525, + "step": 6577 + }, + { + "epoch": 1.996661101836394, + "grad_norm": 0.5289913415908813, + "learning_rate": 6.67510377645034e-05, + "loss": 1.3759, + "step": 6578 + }, + { + "epoch": 1.9969646380330854, + "grad_norm": 0.5340834259986877, + "learning_rate": 6.674597549863319e-05, + "loss": 1.2181, + "step": 6579 + }, + { + "epoch": 1.9972681742297769, + "grad_norm": 0.5501202940940857, + "learning_rate": 6.674091323276299e-05, + "loss": 1.5773, + "step": 6580 + }, + { + "epoch": 1.9975717104264683, + "grad_norm": 0.36932939291000366, + "learning_rate": 6.67358509668928e-05, + "loss": 1.3267, + "step": 6581 + }, + { + "epoch": 1.9978752466231597, + "grad_norm": 0.484678715467453, + "learning_rate": 6.673078870102259e-05, + "loss": 1.8134, + "step": 6582 + }, + { + "epoch": 1.9981787828198514, + "grad_norm": 0.5534485578536987, + "learning_rate": 6.672572643515238e-05, + "loss": 2.0146, + "step": 6583 + }, + { + "epoch": 1.9984823190165426, + "grad_norm": 0.602260410785675, + "learning_rate": 6.672066416928218e-05, + "loss": 1.8876, + "step": 6584 + }, + { + "epoch": 1.9987858552132343, + "grad_norm": 0.568967342376709, + "learning_rate": 6.671560190341197e-05, + "loss": 1.8811, + "step": 6585 + }, + { + "epoch": 1.9990893914099255, + "grad_norm": 0.500963032245636, + "learning_rate": 6.671053963754177e-05, + "loss": 1.2231, + "step": 6586 + }, + { + "epoch": 1.9993929276066171, + "grad_norm": 0.596053957939148, + "learning_rate": 6.670547737167156e-05, + "loss": 1.4158, + "step": 6587 + }, + { + "epoch": 1.9996964638033086, + "grad_norm": 0.6084225177764893, + "learning_rate": 6.670041510580136e-05, + "loss": 1.8643, + "step": 6588 + }, + { + "epoch": 2.0, + "grad_norm": 0.4858667254447937, + "learning_rate": 6.669535283993115e-05, + "loss": 1.4181, + "step": 6589 + }, + { + "epoch": 2.0003035361966917, + "grad_norm": 0.48385685682296753, + "learning_rate": 6.669029057406095e-05, + "loss": 1.5573, + "step": 6590 + }, + { + "epoch": 2.000607072393383, + "grad_norm": 0.5700175166130066, + "learning_rate": 6.668522830819076e-05, + "loss": 1.4403, + "step": 6591 + }, + { + "epoch": 2.0009106085900745, + "grad_norm": 0.6228484511375427, + "learning_rate": 6.668016604232055e-05, + "loss": 1.4034, + "step": 6592 + }, + { + "epoch": 2.0012141447867657, + "grad_norm": 0.5445995330810547, + "learning_rate": 6.667510377645035e-05, + "loss": 1.3197, + "step": 6593 + }, + { + "epoch": 2.0015176809834574, + "grad_norm": 0.6247600317001343, + "learning_rate": 6.667004151058014e-05, + "loss": 1.1154, + "step": 6594 + }, + { + "epoch": 2.0018212171801486, + "grad_norm": 0.6725988388061523, + "learning_rate": 6.666497924470993e-05, + "loss": 1.4879, + "step": 6595 + }, + { + "epoch": 2.0021247533768403, + "grad_norm": 0.7663455605506897, + "learning_rate": 6.665991697883973e-05, + "loss": 1.1032, + "step": 6596 + }, + { + "epoch": 2.0024282895735315, + "grad_norm": 1.1510711908340454, + "learning_rate": 6.665485471296952e-05, + "loss": 1.4117, + "step": 6597 + }, + { + "epoch": 2.002731825770223, + "grad_norm": 0.6524301767349243, + "learning_rate": 6.664979244709932e-05, + "loss": 1.0263, + "step": 6598 + }, + { + "epoch": 2.0030353619669143, + "grad_norm": 0.7597069144248962, + "learning_rate": 6.664473018122911e-05, + "loss": 1.3326, + "step": 6599 + }, + { + "epoch": 2.003338898163606, + "grad_norm": 0.5784065127372742, + "learning_rate": 6.663966791535892e-05, + "loss": 1.2875, + "step": 6600 + }, + { + "epoch": 2.0036424343602977, + "grad_norm": 0.6139408946037292, + "learning_rate": 6.663460564948872e-05, + "loss": 0.7758, + "step": 6601 + }, + { + "epoch": 2.003945970556989, + "grad_norm": 0.6967280507087708, + "learning_rate": 6.662954338361851e-05, + "loss": 1.2892, + "step": 6602 + }, + { + "epoch": 2.0042495067536805, + "grad_norm": 0.6574037671089172, + "learning_rate": 6.66244811177483e-05, + "loss": 1.3883, + "step": 6603 + }, + { + "epoch": 2.0045530429503717, + "grad_norm": 0.6693050265312195, + "learning_rate": 6.66194188518781e-05, + "loss": 1.4671, + "step": 6604 + }, + { + "epoch": 2.0048565791470634, + "grad_norm": 0.9042626619338989, + "learning_rate": 6.66143565860079e-05, + "loss": 1.0162, + "step": 6605 + }, + { + "epoch": 2.0051601153437546, + "grad_norm": 0.7697750329971313, + "learning_rate": 6.660929432013769e-05, + "loss": 1.4319, + "step": 6606 + }, + { + "epoch": 2.0054636515404463, + "grad_norm": 0.7360553741455078, + "learning_rate": 6.660423205426749e-05, + "loss": 0.8861, + "step": 6607 + }, + { + "epoch": 2.0057671877371375, + "grad_norm": 0.7231805324554443, + "learning_rate": 6.659916978839728e-05, + "loss": 1.1145, + "step": 6608 + }, + { + "epoch": 2.006070723933829, + "grad_norm": 0.8397212624549866, + "learning_rate": 6.659410752252709e-05, + "loss": 1.2708, + "step": 6609 + }, + { + "epoch": 2.006374260130521, + "grad_norm": 0.5130758881568909, + "learning_rate": 6.658904525665688e-05, + "loss": 0.8484, + "step": 6610 + }, + { + "epoch": 2.006677796327212, + "grad_norm": 0.7962560653686523, + "learning_rate": 6.658398299078668e-05, + "loss": 1.3299, + "step": 6611 + }, + { + "epoch": 2.0069813325239036, + "grad_norm": 0.772659420967102, + "learning_rate": 6.657892072491647e-05, + "loss": 1.6294, + "step": 6612 + }, + { + "epoch": 2.007284868720595, + "grad_norm": 0.7391024827957153, + "learning_rate": 6.657385845904627e-05, + "loss": 1.0269, + "step": 6613 + }, + { + "epoch": 2.0075884049172865, + "grad_norm": 0.7813867330551147, + "learning_rate": 6.656879619317606e-05, + "loss": 1.1973, + "step": 6614 + }, + { + "epoch": 2.0078919411139777, + "grad_norm": 0.6954705715179443, + "learning_rate": 6.656373392730586e-05, + "loss": 1.5012, + "step": 6615 + }, + { + "epoch": 2.0081954773106694, + "grad_norm": 0.6593831777572632, + "learning_rate": 6.655867166143565e-05, + "loss": 0.9528, + "step": 6616 + }, + { + "epoch": 2.0084990135073606, + "grad_norm": 0.8076401948928833, + "learning_rate": 6.655360939556546e-05, + "loss": 1.0781, + "step": 6617 + }, + { + "epoch": 2.0088025497040523, + "grad_norm": 0.7507150769233704, + "learning_rate": 6.654854712969526e-05, + "loss": 1.4245, + "step": 6618 + }, + { + "epoch": 2.0091060859007435, + "grad_norm": 0.6943393349647522, + "learning_rate": 6.654348486382505e-05, + "loss": 1.224, + "step": 6619 + }, + { + "epoch": 2.009409622097435, + "grad_norm": 0.658205509185791, + "learning_rate": 6.653842259795486e-05, + "loss": 1.0013, + "step": 6620 + }, + { + "epoch": 2.009713158294127, + "grad_norm": 0.54278963804245, + "learning_rate": 6.653336033208465e-05, + "loss": 0.9834, + "step": 6621 + }, + { + "epoch": 2.010016694490818, + "grad_norm": 0.9639625549316406, + "learning_rate": 6.652829806621445e-05, + "loss": 1.5964, + "step": 6622 + }, + { + "epoch": 2.0103202306875096, + "grad_norm": 0.7107603549957275, + "learning_rate": 6.652323580034424e-05, + "loss": 1.1444, + "step": 6623 + }, + { + "epoch": 2.010623766884201, + "grad_norm": 0.7288581728935242, + "learning_rate": 6.651817353447404e-05, + "loss": 1.1682, + "step": 6624 + }, + { + "epoch": 2.0109273030808925, + "grad_norm": 0.7180545926094055, + "learning_rate": 6.651311126860383e-05, + "loss": 1.2499, + "step": 6625 + }, + { + "epoch": 2.0112308392775837, + "grad_norm": 0.514674961566925, + "learning_rate": 6.650804900273363e-05, + "loss": 1.2415, + "step": 6626 + }, + { + "epoch": 2.0115343754742754, + "grad_norm": 0.6022255420684814, + "learning_rate": 6.650298673686342e-05, + "loss": 1.4536, + "step": 6627 + }, + { + "epoch": 2.0118379116709666, + "grad_norm": 0.6236252784729004, + "learning_rate": 6.649792447099322e-05, + "loss": 0.9079, + "step": 6628 + }, + { + "epoch": 2.0121414478676583, + "grad_norm": 0.7158982157707214, + "learning_rate": 6.649286220512301e-05, + "loss": 1.4665, + "step": 6629 + }, + { + "epoch": 2.0124449840643495, + "grad_norm": 0.7587727308273315, + "learning_rate": 6.648779993925282e-05, + "loss": 1.2285, + "step": 6630 + }, + { + "epoch": 2.012748520261041, + "grad_norm": 0.7265990376472473, + "learning_rate": 6.648273767338262e-05, + "loss": 1.5214, + "step": 6631 + }, + { + "epoch": 2.0130520564577328, + "grad_norm": 0.862713634967804, + "learning_rate": 6.647767540751241e-05, + "loss": 1.0318, + "step": 6632 + }, + { + "epoch": 2.013355592654424, + "grad_norm": 1.1299731731414795, + "learning_rate": 6.64726131416422e-05, + "loss": 1.0554, + "step": 6633 + }, + { + "epoch": 2.0136591288511156, + "grad_norm": 0.7257562875747681, + "learning_rate": 6.6467550875772e-05, + "loss": 1.2273, + "step": 6634 + }, + { + "epoch": 2.013962665047807, + "grad_norm": 0.8512704372406006, + "learning_rate": 6.64624886099018e-05, + "loss": 1.5209, + "step": 6635 + }, + { + "epoch": 2.0142662012444985, + "grad_norm": 0.7467771768569946, + "learning_rate": 6.645742634403159e-05, + "loss": 1.5147, + "step": 6636 + }, + { + "epoch": 2.0145697374411897, + "grad_norm": 0.6188955903053284, + "learning_rate": 6.645236407816138e-05, + "loss": 1.3291, + "step": 6637 + }, + { + "epoch": 2.0148732736378814, + "grad_norm": 0.7535701394081116, + "learning_rate": 6.644730181229118e-05, + "loss": 1.4063, + "step": 6638 + }, + { + "epoch": 2.0151768098345726, + "grad_norm": 0.7806875705718994, + "learning_rate": 6.644223954642099e-05, + "loss": 1.2262, + "step": 6639 + }, + { + "epoch": 2.0154803460312642, + "grad_norm": 0.5792213082313538, + "learning_rate": 6.643717728055078e-05, + "loss": 0.9279, + "step": 6640 + }, + { + "epoch": 2.015783882227956, + "grad_norm": 0.8171066045761108, + "learning_rate": 6.643211501468058e-05, + "loss": 1.4051, + "step": 6641 + }, + { + "epoch": 2.016087418424647, + "grad_norm": 0.8089918494224548, + "learning_rate": 6.642705274881037e-05, + "loss": 1.146, + "step": 6642 + }, + { + "epoch": 2.0163909546213388, + "grad_norm": 0.6619982123374939, + "learning_rate": 6.642199048294017e-05, + "loss": 1.2122, + "step": 6643 + }, + { + "epoch": 2.01669449081803, + "grad_norm": 0.7962835431098938, + "learning_rate": 6.641692821706996e-05, + "loss": 1.2783, + "step": 6644 + }, + { + "epoch": 2.0169980270147216, + "grad_norm": 0.8901903033256531, + "learning_rate": 6.641186595119976e-05, + "loss": 1.2049, + "step": 6645 + }, + { + "epoch": 2.017301563211413, + "grad_norm": 0.7541537284851074, + "learning_rate": 6.640680368532955e-05, + "loss": 1.5408, + "step": 6646 + }, + { + "epoch": 2.0176050994081045, + "grad_norm": 0.7315836548805237, + "learning_rate": 6.640174141945935e-05, + "loss": 1.3899, + "step": 6647 + }, + { + "epoch": 2.0179086356047957, + "grad_norm": 0.7619051337242126, + "learning_rate": 6.639667915358915e-05, + "loss": 0.6086, + "step": 6648 + }, + { + "epoch": 2.0182121718014874, + "grad_norm": 0.6528330445289612, + "learning_rate": 6.639161688771895e-05, + "loss": 1.7007, + "step": 6649 + }, + { + "epoch": 2.0185157079981786, + "grad_norm": 0.8032832145690918, + "learning_rate": 6.638655462184874e-05, + "loss": 1.2065, + "step": 6650 + }, + { + "epoch": 2.0188192441948702, + "grad_norm": 0.8372535705566406, + "learning_rate": 6.638149235597854e-05, + "loss": 1.2597, + "step": 6651 + }, + { + "epoch": 2.019122780391562, + "grad_norm": 0.7499954700469971, + "learning_rate": 6.637643009010833e-05, + "loss": 0.9583, + "step": 6652 + }, + { + "epoch": 2.019426316588253, + "grad_norm": 0.6940346956253052, + "learning_rate": 6.637136782423813e-05, + "loss": 1.1369, + "step": 6653 + }, + { + "epoch": 2.0197298527849448, + "grad_norm": 0.6135651469230652, + "learning_rate": 6.636630555836792e-05, + "loss": 0.6737, + "step": 6654 + }, + { + "epoch": 2.020033388981636, + "grad_norm": 0.6377900838851929, + "learning_rate": 6.636124329249772e-05, + "loss": 0.9863, + "step": 6655 + }, + { + "epoch": 2.0203369251783276, + "grad_norm": 0.6649699211120605, + "learning_rate": 6.635618102662751e-05, + "loss": 1.2677, + "step": 6656 + }, + { + "epoch": 2.020640461375019, + "grad_norm": 0.6146379709243774, + "learning_rate": 6.635111876075732e-05, + "loss": 1.3855, + "step": 6657 + }, + { + "epoch": 2.0209439975717105, + "grad_norm": 0.6619699001312256, + "learning_rate": 6.634605649488712e-05, + "loss": 1.0707, + "step": 6658 + }, + { + "epoch": 2.0212475337684017, + "grad_norm": 0.5690382719039917, + "learning_rate": 6.634099422901691e-05, + "loss": 1.5683, + "step": 6659 + }, + { + "epoch": 2.0215510699650934, + "grad_norm": 0.8512270450592041, + "learning_rate": 6.63359319631467e-05, + "loss": 0.9643, + "step": 6660 + }, + { + "epoch": 2.0218546061617846, + "grad_norm": 0.7763013243675232, + "learning_rate": 6.63308696972765e-05, + "loss": 0.7536, + "step": 6661 + }, + { + "epoch": 2.0221581423584762, + "grad_norm": 0.8327085971832275, + "learning_rate": 6.632580743140631e-05, + "loss": 1.3364, + "step": 6662 + }, + { + "epoch": 2.022461678555168, + "grad_norm": 0.4407320022583008, + "learning_rate": 6.63207451655361e-05, + "loss": 1.3085, + "step": 6663 + }, + { + "epoch": 2.022765214751859, + "grad_norm": 0.7932631373405457, + "learning_rate": 6.63156828996659e-05, + "loss": 1.3209, + "step": 6664 + }, + { + "epoch": 2.0230687509485508, + "grad_norm": 0.7854815125465393, + "learning_rate": 6.631062063379569e-05, + "loss": 1.191, + "step": 6665 + }, + { + "epoch": 2.023372287145242, + "grad_norm": 0.5749601721763611, + "learning_rate": 6.630555836792549e-05, + "loss": 1.7217, + "step": 6666 + }, + { + "epoch": 2.0236758233419336, + "grad_norm": 0.6307018995285034, + "learning_rate": 6.630049610205528e-05, + "loss": 1.3251, + "step": 6667 + }, + { + "epoch": 2.023979359538625, + "grad_norm": 0.7390800714492798, + "learning_rate": 6.629543383618508e-05, + "loss": 1.3801, + "step": 6668 + }, + { + "epoch": 2.0242828957353165, + "grad_norm": 1.2337613105773926, + "learning_rate": 6.629037157031489e-05, + "loss": 1.3149, + "step": 6669 + }, + { + "epoch": 2.0245864319320077, + "grad_norm": 0.7825588583946228, + "learning_rate": 6.628530930444468e-05, + "loss": 0.9752, + "step": 6670 + }, + { + "epoch": 2.0248899681286994, + "grad_norm": 0.860419511795044, + "learning_rate": 6.628024703857447e-05, + "loss": 1.4013, + "step": 6671 + }, + { + "epoch": 2.025193504325391, + "grad_norm": 0.5887535810470581, + "learning_rate": 6.627518477270427e-05, + "loss": 1.1861, + "step": 6672 + }, + { + "epoch": 2.0254970405220822, + "grad_norm": 0.6891583204269409, + "learning_rate": 6.627012250683406e-05, + "loss": 0.7213, + "step": 6673 + }, + { + "epoch": 2.025800576718774, + "grad_norm": 0.7431558966636658, + "learning_rate": 6.626506024096386e-05, + "loss": 1.2445, + "step": 6674 + }, + { + "epoch": 2.026104112915465, + "grad_norm": 0.9929723739624023, + "learning_rate": 6.625999797509365e-05, + "loss": 0.622, + "step": 6675 + }, + { + "epoch": 2.0264076491121568, + "grad_norm": 0.7005148530006409, + "learning_rate": 6.625493570922345e-05, + "loss": 1.6876, + "step": 6676 + }, + { + "epoch": 2.026711185308848, + "grad_norm": 0.6688176989555359, + "learning_rate": 6.624987344335324e-05, + "loss": 1.4479, + "step": 6677 + }, + { + "epoch": 2.0270147215055396, + "grad_norm": 0.7675127983093262, + "learning_rate": 6.624481117748305e-05, + "loss": 0.81, + "step": 6678 + }, + { + "epoch": 2.027318257702231, + "grad_norm": 0.6693247556686401, + "learning_rate": 6.623974891161285e-05, + "loss": 1.5767, + "step": 6679 + }, + { + "epoch": 2.0276217938989225, + "grad_norm": 0.741452693939209, + "learning_rate": 6.623468664574264e-05, + "loss": 1.46, + "step": 6680 + }, + { + "epoch": 2.0279253300956137, + "grad_norm": 0.6443573236465454, + "learning_rate": 6.622962437987244e-05, + "loss": 1.4561, + "step": 6681 + }, + { + "epoch": 2.0282288662923054, + "grad_norm": 0.6327553391456604, + "learning_rate": 6.622456211400223e-05, + "loss": 1.0057, + "step": 6682 + }, + { + "epoch": 2.028532402488997, + "grad_norm": 0.6773364543914795, + "learning_rate": 6.621949984813203e-05, + "loss": 1.6921, + "step": 6683 + }, + { + "epoch": 2.0288359386856882, + "grad_norm": 0.7478709816932678, + "learning_rate": 6.621443758226182e-05, + "loss": 1.5898, + "step": 6684 + }, + { + "epoch": 2.02913947488238, + "grad_norm": 0.6670467257499695, + "learning_rate": 6.620937531639162e-05, + "loss": 1.361, + "step": 6685 + }, + { + "epoch": 2.029443011079071, + "grad_norm": 0.660031795501709, + "learning_rate": 6.620431305052141e-05, + "loss": 1.3718, + "step": 6686 + }, + { + "epoch": 2.0297465472757628, + "grad_norm": 0.8065965175628662, + "learning_rate": 6.619925078465122e-05, + "loss": 1.4678, + "step": 6687 + }, + { + "epoch": 2.030050083472454, + "grad_norm": 0.756514847278595, + "learning_rate": 6.619418851878101e-05, + "loss": 1.2949, + "step": 6688 + }, + { + "epoch": 2.0303536196691456, + "grad_norm": 0.673550009727478, + "learning_rate": 6.618912625291081e-05, + "loss": 0.8403, + "step": 6689 + }, + { + "epoch": 2.030657155865837, + "grad_norm": 0.7104129195213318, + "learning_rate": 6.61840639870406e-05, + "loss": 0.9143, + "step": 6690 + }, + { + "epoch": 2.0309606920625285, + "grad_norm": 0.5645703673362732, + "learning_rate": 6.61790017211704e-05, + "loss": 0.7835, + "step": 6691 + }, + { + "epoch": 2.0312642282592197, + "grad_norm": 0.8361724615097046, + "learning_rate": 6.617393945530019e-05, + "loss": 1.345, + "step": 6692 + }, + { + "epoch": 2.0315677644559114, + "grad_norm": 0.7160463929176331, + "learning_rate": 6.616887718942999e-05, + "loss": 0.9569, + "step": 6693 + }, + { + "epoch": 2.031871300652603, + "grad_norm": 0.662090003490448, + "learning_rate": 6.616381492355978e-05, + "loss": 1.3514, + "step": 6694 + }, + { + "epoch": 2.0321748368492942, + "grad_norm": 0.6347789764404297, + "learning_rate": 6.615875265768958e-05, + "loss": 1.5523, + "step": 6695 + }, + { + "epoch": 2.032478373045986, + "grad_norm": 0.7517459392547607, + "learning_rate": 6.615369039181937e-05, + "loss": 0.4903, + "step": 6696 + }, + { + "epoch": 2.032781909242677, + "grad_norm": 0.7042916417121887, + "learning_rate": 6.614862812594918e-05, + "loss": 1.3716, + "step": 6697 + }, + { + "epoch": 2.0330854454393688, + "grad_norm": 0.6985395550727844, + "learning_rate": 6.614356586007897e-05, + "loss": 1.139, + "step": 6698 + }, + { + "epoch": 2.03338898163606, + "grad_norm": 0.7075894474983215, + "learning_rate": 6.613850359420877e-05, + "loss": 1.3291, + "step": 6699 + }, + { + "epoch": 2.0336925178327516, + "grad_norm": 0.9580946564674377, + "learning_rate": 6.613344132833856e-05, + "loss": 0.7737, + "step": 6700 + }, + { + "epoch": 2.033996054029443, + "grad_norm": 0.7944507598876953, + "learning_rate": 6.612837906246836e-05, + "loss": 1.3566, + "step": 6701 + }, + { + "epoch": 2.0342995902261345, + "grad_norm": 0.8060106039047241, + "learning_rate": 6.612331679659815e-05, + "loss": 1.3165, + "step": 6702 + }, + { + "epoch": 2.0346031264228257, + "grad_norm": 0.9689031839370728, + "learning_rate": 6.611825453072795e-05, + "loss": 1.2286, + "step": 6703 + }, + { + "epoch": 2.0349066626195174, + "grad_norm": 0.7015454173088074, + "learning_rate": 6.611319226485774e-05, + "loss": 1.2569, + "step": 6704 + }, + { + "epoch": 2.035210198816209, + "grad_norm": 0.769710898399353, + "learning_rate": 6.610812999898754e-05, + "loss": 0.9488, + "step": 6705 + }, + { + "epoch": 2.0355137350129002, + "grad_norm": 0.5991962552070618, + "learning_rate": 6.610306773311735e-05, + "loss": 1.6102, + "step": 6706 + }, + { + "epoch": 2.035817271209592, + "grad_norm": 0.6914335489273071, + "learning_rate": 6.609800546724714e-05, + "loss": 1.6039, + "step": 6707 + }, + { + "epoch": 2.036120807406283, + "grad_norm": 0.6637049913406372, + "learning_rate": 6.609294320137695e-05, + "loss": 1.2567, + "step": 6708 + }, + { + "epoch": 2.0364243436029748, + "grad_norm": 0.7493016719818115, + "learning_rate": 6.608788093550674e-05, + "loss": 1.0676, + "step": 6709 + }, + { + "epoch": 2.036727879799666, + "grad_norm": 0.7196962237358093, + "learning_rate": 6.608281866963654e-05, + "loss": 1.1692, + "step": 6710 + }, + { + "epoch": 2.0370314159963576, + "grad_norm": 0.6665316820144653, + "learning_rate": 6.607775640376633e-05, + "loss": 1.4351, + "step": 6711 + }, + { + "epoch": 2.037334952193049, + "grad_norm": 0.7386513948440552, + "learning_rate": 6.607269413789613e-05, + "loss": 1.1418, + "step": 6712 + }, + { + "epoch": 2.0376384883897405, + "grad_norm": 0.6668844223022461, + "learning_rate": 6.606763187202592e-05, + "loss": 0.772, + "step": 6713 + }, + { + "epoch": 2.037942024586432, + "grad_norm": 0.9029124975204468, + "learning_rate": 6.606256960615572e-05, + "loss": 1.5783, + "step": 6714 + }, + { + "epoch": 2.0382455607831234, + "grad_norm": 0.6456940770149231, + "learning_rate": 6.605750734028551e-05, + "loss": 1.0193, + "step": 6715 + }, + { + "epoch": 2.038549096979815, + "grad_norm": 0.8892134428024292, + "learning_rate": 6.605244507441531e-05, + "loss": 0.8457, + "step": 6716 + }, + { + "epoch": 2.0388526331765062, + "grad_norm": 0.7285386323928833, + "learning_rate": 6.604738280854512e-05, + "loss": 1.2503, + "step": 6717 + }, + { + "epoch": 2.039156169373198, + "grad_norm": 0.7347849011421204, + "learning_rate": 6.604232054267491e-05, + "loss": 1.146, + "step": 6718 + }, + { + "epoch": 2.039459705569889, + "grad_norm": 0.727271556854248, + "learning_rate": 6.60372582768047e-05, + "loss": 1.4351, + "step": 6719 + }, + { + "epoch": 2.0397632417665807, + "grad_norm": 1.4093507528305054, + "learning_rate": 6.60321960109345e-05, + "loss": 0.5674, + "step": 6720 + }, + { + "epoch": 2.040066777963272, + "grad_norm": 0.4269008934497833, + "learning_rate": 6.60271337450643e-05, + "loss": 0.8792, + "step": 6721 + }, + { + "epoch": 2.0403703141599636, + "grad_norm": 0.6046574115753174, + "learning_rate": 6.602207147919409e-05, + "loss": 0.9758, + "step": 6722 + }, + { + "epoch": 2.040673850356655, + "grad_norm": 0.7757171988487244, + "learning_rate": 6.601700921332389e-05, + "loss": 1.39, + "step": 6723 + }, + { + "epoch": 2.0409773865533465, + "grad_norm": 0.8423404693603516, + "learning_rate": 6.601194694745368e-05, + "loss": 1.0898, + "step": 6724 + }, + { + "epoch": 2.041280922750038, + "grad_norm": 0.7536158561706543, + "learning_rate": 6.600688468158347e-05, + "loss": 1.0093, + "step": 6725 + }, + { + "epoch": 2.0415844589467294, + "grad_norm": 0.7584403157234192, + "learning_rate": 6.600182241571328e-05, + "loss": 1.4647, + "step": 6726 + }, + { + "epoch": 2.041887995143421, + "grad_norm": 0.7223381996154785, + "learning_rate": 6.599676014984308e-05, + "loss": 1.0329, + "step": 6727 + }, + { + "epoch": 2.042191531340112, + "grad_norm": 0.7976197600364685, + "learning_rate": 6.599169788397287e-05, + "loss": 1.3605, + "step": 6728 + }, + { + "epoch": 2.042495067536804, + "grad_norm": 0.763677179813385, + "learning_rate": 6.598663561810267e-05, + "loss": 1.2684, + "step": 6729 + }, + { + "epoch": 2.042798603733495, + "grad_norm": 0.813032329082489, + "learning_rate": 6.598157335223246e-05, + "loss": 1.4194, + "step": 6730 + }, + { + "epoch": 2.0431021399301867, + "grad_norm": 0.6929876804351807, + "learning_rate": 6.597651108636226e-05, + "loss": 1.4115, + "step": 6731 + }, + { + "epoch": 2.043405676126878, + "grad_norm": 0.8353222012519836, + "learning_rate": 6.597144882049205e-05, + "loss": 1.0849, + "step": 6732 + }, + { + "epoch": 2.0437092123235696, + "grad_norm": 0.5235282182693481, + "learning_rate": 6.596638655462185e-05, + "loss": 1.1781, + "step": 6733 + }, + { + "epoch": 2.044012748520261, + "grad_norm": 0.7527014017105103, + "learning_rate": 6.596132428875164e-05, + "loss": 1.3699, + "step": 6734 + }, + { + "epoch": 2.0443162847169525, + "grad_norm": 0.5982792377471924, + "learning_rate": 6.595626202288144e-05, + "loss": 1.5603, + "step": 6735 + }, + { + "epoch": 2.044619820913644, + "grad_norm": 1.0470722913742065, + "learning_rate": 6.595119975701124e-05, + "loss": 1.2232, + "step": 6736 + }, + { + "epoch": 2.0449233571103353, + "grad_norm": 0.7752648591995239, + "learning_rate": 6.594613749114104e-05, + "loss": 1.4848, + "step": 6737 + }, + { + "epoch": 2.045226893307027, + "grad_norm": 0.6654653549194336, + "learning_rate": 6.594107522527083e-05, + "loss": 1.6215, + "step": 6738 + }, + { + "epoch": 2.045530429503718, + "grad_norm": 0.8119179010391235, + "learning_rate": 6.593601295940063e-05, + "loss": 1.1391, + "step": 6739 + }, + { + "epoch": 2.04583396570041, + "grad_norm": 0.7314397096633911, + "learning_rate": 6.593095069353042e-05, + "loss": 1.5554, + "step": 6740 + }, + { + "epoch": 2.046137501897101, + "grad_norm": 1.1362113952636719, + "learning_rate": 6.592588842766022e-05, + "loss": 0.536, + "step": 6741 + }, + { + "epoch": 2.0464410380937927, + "grad_norm": 1.011203408241272, + "learning_rate": 6.592082616179001e-05, + "loss": 0.7209, + "step": 6742 + }, + { + "epoch": 2.046744574290484, + "grad_norm": 0.7223535776138306, + "learning_rate": 6.591576389591981e-05, + "loss": 1.3491, + "step": 6743 + }, + { + "epoch": 2.0470481104871756, + "grad_norm": 0.6654912233352661, + "learning_rate": 6.59107016300496e-05, + "loss": 1.3324, + "step": 6744 + }, + { + "epoch": 2.0473516466838673, + "grad_norm": 0.5610687136650085, + "learning_rate": 6.590563936417941e-05, + "loss": 1.0092, + "step": 6745 + }, + { + "epoch": 2.0476551828805585, + "grad_norm": 1.212891697883606, + "learning_rate": 6.59005770983092e-05, + "loss": 1.586, + "step": 6746 + }, + { + "epoch": 2.04795871907725, + "grad_norm": 0.7124261856079102, + "learning_rate": 6.5895514832439e-05, + "loss": 1.2772, + "step": 6747 + }, + { + "epoch": 2.0482622552739413, + "grad_norm": 0.8526630997657776, + "learning_rate": 6.58904525665688e-05, + "loss": 1.0371, + "step": 6748 + }, + { + "epoch": 2.048565791470633, + "grad_norm": 0.697640597820282, + "learning_rate": 6.588539030069859e-05, + "loss": 1.1681, + "step": 6749 + }, + { + "epoch": 2.048869327667324, + "grad_norm": 0.627429187297821, + "learning_rate": 6.588032803482839e-05, + "loss": 1.5366, + "step": 6750 + }, + { + "epoch": 2.049172863864016, + "grad_norm": 0.650415301322937, + "learning_rate": 6.58752657689582e-05, + "loss": 0.9809, + "step": 6751 + }, + { + "epoch": 2.049476400060707, + "grad_norm": 0.560741126537323, + "learning_rate": 6.587020350308799e-05, + "loss": 1.8711, + "step": 6752 + }, + { + "epoch": 2.0497799362573987, + "grad_norm": 0.6494907736778259, + "learning_rate": 6.586514123721778e-05, + "loss": 1.5271, + "step": 6753 + }, + { + "epoch": 2.05008347245409, + "grad_norm": 0.7896031141281128, + "learning_rate": 6.586007897134758e-05, + "loss": 1.2083, + "step": 6754 + }, + { + "epoch": 2.0503870086507816, + "grad_norm": 1.015256404876709, + "learning_rate": 6.585501670547737e-05, + "loss": 1.3374, + "step": 6755 + }, + { + "epoch": 2.0506905448474733, + "grad_norm": 0.7154099345207214, + "learning_rate": 6.584995443960718e-05, + "loss": 1.039, + "step": 6756 + }, + { + "epoch": 2.0509940810441645, + "grad_norm": 0.6052323579788208, + "learning_rate": 6.584489217373698e-05, + "loss": 1.5066, + "step": 6757 + }, + { + "epoch": 2.051297617240856, + "grad_norm": 0.6472240090370178, + "learning_rate": 6.583982990786677e-05, + "loss": 1.468, + "step": 6758 + }, + { + "epoch": 2.0516011534375473, + "grad_norm": 0.6906863451004028, + "learning_rate": 6.583476764199657e-05, + "loss": 1.5298, + "step": 6759 + }, + { + "epoch": 2.051904689634239, + "grad_norm": 0.8233827948570251, + "learning_rate": 6.582970537612636e-05, + "loss": 0.9356, + "step": 6760 + }, + { + "epoch": 2.05220822583093, + "grad_norm": 0.6991246938705444, + "learning_rate": 6.582464311025616e-05, + "loss": 1.5262, + "step": 6761 + }, + { + "epoch": 2.052511762027622, + "grad_norm": 0.8245313763618469, + "learning_rate": 6.581958084438595e-05, + "loss": 1.1714, + "step": 6762 + }, + { + "epoch": 2.052815298224313, + "grad_norm": 0.7749815583229065, + "learning_rate": 6.581451857851574e-05, + "loss": 0.8446, + "step": 6763 + }, + { + "epoch": 2.0531188344210047, + "grad_norm": 1.0583815574645996, + "learning_rate": 6.580945631264554e-05, + "loss": 1.2243, + "step": 6764 + }, + { + "epoch": 2.053422370617696, + "grad_norm": 0.5424079298973083, + "learning_rate": 6.580439404677535e-05, + "loss": 1.34, + "step": 6765 + }, + { + "epoch": 2.0537259068143876, + "grad_norm": 0.7240017652511597, + "learning_rate": 6.579933178090514e-05, + "loss": 0.9282, + "step": 6766 + }, + { + "epoch": 2.0540294430110793, + "grad_norm": 0.692767322063446, + "learning_rate": 6.579426951503494e-05, + "loss": 1.4957, + "step": 6767 + }, + { + "epoch": 2.0543329792077705, + "grad_norm": 0.7238978147506714, + "learning_rate": 6.578920724916473e-05, + "loss": 1.2649, + "step": 6768 + }, + { + "epoch": 2.054636515404462, + "grad_norm": 0.5727119445800781, + "learning_rate": 6.578414498329453e-05, + "loss": 1.1351, + "step": 6769 + }, + { + "epoch": 2.0549400516011533, + "grad_norm": 0.8237214088439941, + "learning_rate": 6.577908271742432e-05, + "loss": 1.352, + "step": 6770 + }, + { + "epoch": 2.055243587797845, + "grad_norm": 0.71387779712677, + "learning_rate": 6.577402045155412e-05, + "loss": 1.2913, + "step": 6771 + }, + { + "epoch": 2.055547123994536, + "grad_norm": 0.6894492506980896, + "learning_rate": 6.576895818568391e-05, + "loss": 1.3507, + "step": 6772 + }, + { + "epoch": 2.055850660191228, + "grad_norm": 0.7831125855445862, + "learning_rate": 6.57638959198137e-05, + "loss": 1.2404, + "step": 6773 + }, + { + "epoch": 2.056154196387919, + "grad_norm": 0.6227384805679321, + "learning_rate": 6.57588336539435e-05, + "loss": 1.1396, + "step": 6774 + }, + { + "epoch": 2.0564577325846107, + "grad_norm": 0.7114644646644592, + "learning_rate": 6.575377138807331e-05, + "loss": 1.1895, + "step": 6775 + }, + { + "epoch": 2.0567612687813024, + "grad_norm": 0.7928556799888611, + "learning_rate": 6.57487091222031e-05, + "loss": 1.1681, + "step": 6776 + }, + { + "epoch": 2.0570648049779936, + "grad_norm": 0.8106712102890015, + "learning_rate": 6.57436468563329e-05, + "loss": 0.9562, + "step": 6777 + }, + { + "epoch": 2.0573683411746853, + "grad_norm": 0.792510986328125, + "learning_rate": 6.57385845904627e-05, + "loss": 1.2987, + "step": 6778 + }, + { + "epoch": 2.0576718773713765, + "grad_norm": 0.8379880785942078, + "learning_rate": 6.573352232459249e-05, + "loss": 1.5292, + "step": 6779 + }, + { + "epoch": 2.057975413568068, + "grad_norm": 0.7367234826087952, + "learning_rate": 6.572846005872228e-05, + "loss": 1.478, + "step": 6780 + }, + { + "epoch": 2.0582789497647593, + "grad_norm": 0.9199308753013611, + "learning_rate": 6.572339779285208e-05, + "loss": 1.0923, + "step": 6781 + }, + { + "epoch": 2.058582485961451, + "grad_norm": 0.6934099197387695, + "learning_rate": 6.571833552698187e-05, + "loss": 0.7662, + "step": 6782 + }, + { + "epoch": 2.058886022158142, + "grad_norm": 0.7979916334152222, + "learning_rate": 6.571327326111167e-05, + "loss": 1.4459, + "step": 6783 + }, + { + "epoch": 2.059189558354834, + "grad_norm": 0.6911612749099731, + "learning_rate": 6.570821099524148e-05, + "loss": 1.357, + "step": 6784 + }, + { + "epoch": 2.059493094551525, + "grad_norm": 0.7487713098526001, + "learning_rate": 6.570314872937127e-05, + "loss": 1.3231, + "step": 6785 + }, + { + "epoch": 2.0597966307482167, + "grad_norm": 0.7138471603393555, + "learning_rate": 6.569808646350107e-05, + "loss": 1.6311, + "step": 6786 + }, + { + "epoch": 2.0601001669449084, + "grad_norm": 0.911524772644043, + "learning_rate": 6.569302419763086e-05, + "loss": 1.2923, + "step": 6787 + }, + { + "epoch": 2.0604037031415996, + "grad_norm": 0.779221773147583, + "learning_rate": 6.568796193176066e-05, + "loss": 1.518, + "step": 6788 + }, + { + "epoch": 2.0607072393382913, + "grad_norm": 0.7897789478302002, + "learning_rate": 6.568289966589045e-05, + "loss": 1.2195, + "step": 6789 + }, + { + "epoch": 2.0610107755349825, + "grad_norm": 0.5560900568962097, + "learning_rate": 6.567783740002024e-05, + "loss": 1.3346, + "step": 6790 + }, + { + "epoch": 2.061314311731674, + "grad_norm": 0.7780158519744873, + "learning_rate": 6.567277513415004e-05, + "loss": 1.283, + "step": 6791 + }, + { + "epoch": 2.0616178479283653, + "grad_norm": 0.8399558067321777, + "learning_rate": 6.566771286827983e-05, + "loss": 1.5827, + "step": 6792 + }, + { + "epoch": 2.061921384125057, + "grad_norm": 0.6238407492637634, + "learning_rate": 6.566265060240964e-05, + "loss": 1.0665, + "step": 6793 + }, + { + "epoch": 2.062224920321748, + "grad_norm": 0.8223831057548523, + "learning_rate": 6.565758833653944e-05, + "loss": 1.3561, + "step": 6794 + }, + { + "epoch": 2.06252845651844, + "grad_norm": 0.5908865332603455, + "learning_rate": 6.565252607066925e-05, + "loss": 1.5198, + "step": 6795 + }, + { + "epoch": 2.062831992715131, + "grad_norm": 0.7312744855880737, + "learning_rate": 6.564746380479904e-05, + "loss": 1.3568, + "step": 6796 + }, + { + "epoch": 2.0631355289118227, + "grad_norm": 0.794439435005188, + "learning_rate": 6.564240153892884e-05, + "loss": 1.1679, + "step": 6797 + }, + { + "epoch": 2.0634390651085144, + "grad_norm": 0.58745276927948, + "learning_rate": 6.563733927305863e-05, + "loss": 1.4664, + "step": 6798 + }, + { + "epoch": 2.0637426013052056, + "grad_norm": 0.9755748510360718, + "learning_rate": 6.563227700718843e-05, + "loss": 1.0525, + "step": 6799 + }, + { + "epoch": 2.0640461375018972, + "grad_norm": 0.7289798855781555, + "learning_rate": 6.562721474131822e-05, + "loss": 1.392, + "step": 6800 + }, + { + "epoch": 2.0643496736985885, + "grad_norm": 0.7182273864746094, + "learning_rate": 6.562215247544801e-05, + "loss": 1.3755, + "step": 6801 + }, + { + "epoch": 2.06465320989528, + "grad_norm": 0.6822550892829895, + "learning_rate": 6.561709020957781e-05, + "loss": 1.6559, + "step": 6802 + }, + { + "epoch": 2.0649567460919713, + "grad_norm": 0.838714599609375, + "learning_rate": 6.56120279437076e-05, + "loss": 1.291, + "step": 6803 + }, + { + "epoch": 2.065260282288663, + "grad_norm": 1.2011395692825317, + "learning_rate": 6.560696567783741e-05, + "loss": 1.1399, + "step": 6804 + }, + { + "epoch": 2.065563818485354, + "grad_norm": 0.8120669722557068, + "learning_rate": 6.560190341196721e-05, + "loss": 1.1055, + "step": 6805 + }, + { + "epoch": 2.065867354682046, + "grad_norm": 0.7051123976707458, + "learning_rate": 6.5596841146097e-05, + "loss": 1.3991, + "step": 6806 + }, + { + "epoch": 2.0661708908787375, + "grad_norm": 0.8569958806037903, + "learning_rate": 6.55917788802268e-05, + "loss": 1.2338, + "step": 6807 + }, + { + "epoch": 2.0664744270754287, + "grad_norm": 0.5877040028572083, + "learning_rate": 6.558671661435659e-05, + "loss": 1.2854, + "step": 6808 + }, + { + "epoch": 2.0667779632721204, + "grad_norm": 0.6768965125083923, + "learning_rate": 6.558165434848639e-05, + "loss": 1.4867, + "step": 6809 + }, + { + "epoch": 2.0670814994688116, + "grad_norm": 0.6104425191879272, + "learning_rate": 6.557659208261618e-05, + "loss": 1.2566, + "step": 6810 + }, + { + "epoch": 2.0673850356655032, + "grad_norm": 0.8459896445274353, + "learning_rate": 6.557152981674598e-05, + "loss": 0.9956, + "step": 6811 + }, + { + "epoch": 2.0676885718621945, + "grad_norm": 0.4863753914833069, + "learning_rate": 6.556646755087577e-05, + "loss": 1.5894, + "step": 6812 + }, + { + "epoch": 2.067992108058886, + "grad_norm": 0.4200766086578369, + "learning_rate": 6.556140528500557e-05, + "loss": 1.3425, + "step": 6813 + }, + { + "epoch": 2.0682956442555773, + "grad_norm": 0.6907030940055847, + "learning_rate": 6.555634301913537e-05, + "loss": 1.4898, + "step": 6814 + }, + { + "epoch": 2.068599180452269, + "grad_norm": 0.5948358774185181, + "learning_rate": 6.555128075326517e-05, + "loss": 1.2708, + "step": 6815 + }, + { + "epoch": 2.06890271664896, + "grad_norm": 0.8029438257217407, + "learning_rate": 6.554621848739496e-05, + "loss": 1.4541, + "step": 6816 + }, + { + "epoch": 2.069206252845652, + "grad_norm": 0.7332115173339844, + "learning_rate": 6.554115622152476e-05, + "loss": 1.4676, + "step": 6817 + }, + { + "epoch": 2.0695097890423435, + "grad_norm": 1.0768547058105469, + "learning_rate": 6.553609395565455e-05, + "loss": 1.1385, + "step": 6818 + }, + { + "epoch": 2.0698133252390347, + "grad_norm": 0.7721269726753235, + "learning_rate": 6.553103168978435e-05, + "loss": 1.4892, + "step": 6819 + }, + { + "epoch": 2.0701168614357264, + "grad_norm": 0.7033138275146484, + "learning_rate": 6.552596942391414e-05, + "loss": 1.3887, + "step": 6820 + }, + { + "epoch": 2.0704203976324176, + "grad_norm": 0.9070338010787964, + "learning_rate": 6.552090715804394e-05, + "loss": 1.3903, + "step": 6821 + }, + { + "epoch": 2.0707239338291092, + "grad_norm": 0.7374278903007507, + "learning_rate": 6.551584489217373e-05, + "loss": 1.3918, + "step": 6822 + }, + { + "epoch": 2.0710274700258005, + "grad_norm": 0.734194815158844, + "learning_rate": 6.551078262630354e-05, + "loss": 1.3215, + "step": 6823 + }, + { + "epoch": 2.071331006222492, + "grad_norm": 0.5715214610099792, + "learning_rate": 6.550572036043334e-05, + "loss": 1.5389, + "step": 6824 + }, + { + "epoch": 2.0716345424191833, + "grad_norm": 0.7062519192695618, + "learning_rate": 6.550065809456313e-05, + "loss": 1.2238, + "step": 6825 + }, + { + "epoch": 2.071938078615875, + "grad_norm": 0.6726373434066772, + "learning_rate": 6.549559582869293e-05, + "loss": 1.3836, + "step": 6826 + }, + { + "epoch": 2.072241614812566, + "grad_norm": 0.7280587553977966, + "learning_rate": 6.549053356282272e-05, + "loss": 1.7095, + "step": 6827 + }, + { + "epoch": 2.072545151009258, + "grad_norm": 0.6290583610534668, + "learning_rate": 6.548547129695251e-05, + "loss": 1.177, + "step": 6828 + }, + { + "epoch": 2.0728486872059495, + "grad_norm": 0.8294208645820618, + "learning_rate": 6.548040903108231e-05, + "loss": 1.0588, + "step": 6829 + }, + { + "epoch": 2.0731522234026407, + "grad_norm": 0.6298262476921082, + "learning_rate": 6.54753467652121e-05, + "loss": 0.8205, + "step": 6830 + }, + { + "epoch": 2.0734557595993324, + "grad_norm": 0.7950041890144348, + "learning_rate": 6.54702844993419e-05, + "loss": 1.1332, + "step": 6831 + }, + { + "epoch": 2.0737592957960236, + "grad_norm": 0.8786191940307617, + "learning_rate": 6.546522223347171e-05, + "loss": 1.1891, + "step": 6832 + }, + { + "epoch": 2.0740628319927152, + "grad_norm": 0.7599530816078186, + "learning_rate": 6.54601599676015e-05, + "loss": 1.4681, + "step": 6833 + }, + { + "epoch": 2.0743663681894065, + "grad_norm": 0.7391897439956665, + "learning_rate": 6.54550977017313e-05, + "loss": 1.3549, + "step": 6834 + }, + { + "epoch": 2.074669904386098, + "grad_norm": 1.192667007446289, + "learning_rate": 6.545003543586109e-05, + "loss": 1.3083, + "step": 6835 + }, + { + "epoch": 2.0749734405827893, + "grad_norm": 0.5997875332832336, + "learning_rate": 6.544497316999089e-05, + "loss": 1.1885, + "step": 6836 + }, + { + "epoch": 2.075276976779481, + "grad_norm": 0.5613851547241211, + "learning_rate": 6.543991090412068e-05, + "loss": 1.4794, + "step": 6837 + }, + { + "epoch": 2.075580512976172, + "grad_norm": 1.016721487045288, + "learning_rate": 6.543484863825048e-05, + "loss": 1.3954, + "step": 6838 + }, + { + "epoch": 2.075884049172864, + "grad_norm": 0.7319069504737854, + "learning_rate": 6.542978637238027e-05, + "loss": 1.1868, + "step": 6839 + }, + { + "epoch": 2.0761875853695555, + "grad_norm": 0.8343705534934998, + "learning_rate": 6.542472410651008e-05, + "loss": 1.025, + "step": 6840 + }, + { + "epoch": 2.0764911215662467, + "grad_norm": 0.6604619026184082, + "learning_rate": 6.541966184063987e-05, + "loss": 0.9608, + "step": 6841 + }, + { + "epoch": 2.0767946577629384, + "grad_norm": 0.7044042944908142, + "learning_rate": 6.541459957476967e-05, + "loss": 1.2809, + "step": 6842 + }, + { + "epoch": 2.0770981939596296, + "grad_norm": 0.7844612002372742, + "learning_rate": 6.540953730889948e-05, + "loss": 1.1977, + "step": 6843 + }, + { + "epoch": 2.0774017301563212, + "grad_norm": 0.6686902642250061, + "learning_rate": 6.540447504302927e-05, + "loss": 1.6281, + "step": 6844 + }, + { + "epoch": 2.0777052663530124, + "grad_norm": 0.8168292045593262, + "learning_rate": 6.539941277715907e-05, + "loss": 1.5787, + "step": 6845 + }, + { + "epoch": 2.078008802549704, + "grad_norm": 0.7469481229782104, + "learning_rate": 6.539435051128886e-05, + "loss": 0.922, + "step": 6846 + }, + { + "epoch": 2.0783123387463953, + "grad_norm": 0.797624945640564, + "learning_rate": 6.538928824541866e-05, + "loss": 1.7119, + "step": 6847 + }, + { + "epoch": 2.078615874943087, + "grad_norm": 0.6301280856132507, + "learning_rate": 6.538422597954845e-05, + "loss": 1.2768, + "step": 6848 + }, + { + "epoch": 2.0789194111397786, + "grad_norm": 0.834862232208252, + "learning_rate": 6.537916371367825e-05, + "loss": 1.4923, + "step": 6849 + }, + { + "epoch": 2.07922294733647, + "grad_norm": 0.8240933418273926, + "learning_rate": 6.537410144780804e-05, + "loss": 1.1109, + "step": 6850 + }, + { + "epoch": 2.0795264835331615, + "grad_norm": 0.6094114780426025, + "learning_rate": 6.536903918193784e-05, + "loss": 1.2795, + "step": 6851 + }, + { + "epoch": 2.0798300197298527, + "grad_norm": 0.7783163189888, + "learning_rate": 6.536397691606763e-05, + "loss": 1.2485, + "step": 6852 + }, + { + "epoch": 2.0801335559265444, + "grad_norm": 0.6736633777618408, + "learning_rate": 6.535891465019744e-05, + "loss": 1.4631, + "step": 6853 + }, + { + "epoch": 2.0804370921232356, + "grad_norm": 0.7661424279212952, + "learning_rate": 6.535385238432723e-05, + "loss": 1.1812, + "step": 6854 + }, + { + "epoch": 2.0807406283199272, + "grad_norm": 0.4843573272228241, + "learning_rate": 6.534879011845703e-05, + "loss": 1.1932, + "step": 6855 + }, + { + "epoch": 2.0810441645166184, + "grad_norm": 0.6728962063789368, + "learning_rate": 6.534372785258682e-05, + "loss": 1.4108, + "step": 6856 + }, + { + "epoch": 2.08134770071331, + "grad_norm": 0.7444415092468262, + "learning_rate": 6.533866558671662e-05, + "loss": 1.647, + "step": 6857 + }, + { + "epoch": 2.0816512369100013, + "grad_norm": 0.8934520483016968, + "learning_rate": 6.533360332084641e-05, + "loss": 1.444, + "step": 6858 + }, + { + "epoch": 2.081954773106693, + "grad_norm": 0.7796207070350647, + "learning_rate": 6.532854105497621e-05, + "loss": 1.3639, + "step": 6859 + }, + { + "epoch": 2.0822583093033846, + "grad_norm": 0.7691407799720764, + "learning_rate": 6.5323478789106e-05, + "loss": 1.4077, + "step": 6860 + }, + { + "epoch": 2.082561845500076, + "grad_norm": 0.7715175747871399, + "learning_rate": 6.53184165232358e-05, + "loss": 1.0336, + "step": 6861 + }, + { + "epoch": 2.0828653816967675, + "grad_norm": 0.8336794376373291, + "learning_rate": 6.53133542573656e-05, + "loss": 1.2306, + "step": 6862 + }, + { + "epoch": 2.0831689178934587, + "grad_norm": 0.7209054827690125, + "learning_rate": 6.53082919914954e-05, + "loss": 0.8951, + "step": 6863 + }, + { + "epoch": 2.0834724540901504, + "grad_norm": 0.7160457372665405, + "learning_rate": 6.53032297256252e-05, + "loss": 1.2746, + "step": 6864 + }, + { + "epoch": 2.0837759902868416, + "grad_norm": 0.8172768354415894, + "learning_rate": 6.529816745975499e-05, + "loss": 0.9319, + "step": 6865 + }, + { + "epoch": 2.0840795264835332, + "grad_norm": 0.626451849937439, + "learning_rate": 6.529310519388478e-05, + "loss": 1.4666, + "step": 6866 + }, + { + "epoch": 2.0843830626802244, + "grad_norm": 0.6044541597366333, + "learning_rate": 6.528804292801458e-05, + "loss": 0.9215, + "step": 6867 + }, + { + "epoch": 2.084686598876916, + "grad_norm": 0.6968386769294739, + "learning_rate": 6.528298066214437e-05, + "loss": 1.3271, + "step": 6868 + }, + { + "epoch": 2.0849901350736078, + "grad_norm": 0.8236109614372253, + "learning_rate": 6.527791839627417e-05, + "loss": 1.354, + "step": 6869 + }, + { + "epoch": 2.085293671270299, + "grad_norm": 0.6705644130706787, + "learning_rate": 6.527285613040396e-05, + "loss": 1.2462, + "step": 6870 + }, + { + "epoch": 2.0855972074669906, + "grad_norm": 0.8627272844314575, + "learning_rate": 6.526779386453377e-05, + "loss": 1.2606, + "step": 6871 + }, + { + "epoch": 2.085900743663682, + "grad_norm": 0.7665202617645264, + "learning_rate": 6.526273159866357e-05, + "loss": 1.073, + "step": 6872 + }, + { + "epoch": 2.0862042798603735, + "grad_norm": 0.7215783596038818, + "learning_rate": 6.525766933279336e-05, + "loss": 1.2895, + "step": 6873 + }, + { + "epoch": 2.0865078160570647, + "grad_norm": 0.8607796430587769, + "learning_rate": 6.525260706692316e-05, + "loss": 0.9202, + "step": 6874 + }, + { + "epoch": 2.0868113522537564, + "grad_norm": 0.8341158032417297, + "learning_rate": 6.524754480105295e-05, + "loss": 1.056, + "step": 6875 + }, + { + "epoch": 2.0871148884504476, + "grad_norm": 0.8036590218544006, + "learning_rate": 6.524248253518275e-05, + "loss": 1.3272, + "step": 6876 + }, + { + "epoch": 2.0874184246471392, + "grad_norm": 1.0152392387390137, + "learning_rate": 6.523742026931254e-05, + "loss": 1.3854, + "step": 6877 + }, + { + "epoch": 2.0877219608438304, + "grad_norm": 0.6920920610427856, + "learning_rate": 6.523235800344234e-05, + "loss": 1.6818, + "step": 6878 + }, + { + "epoch": 2.088025497040522, + "grad_norm": 0.8714667558670044, + "learning_rate": 6.522729573757213e-05, + "loss": 1.3507, + "step": 6879 + }, + { + "epoch": 2.0883290332372137, + "grad_norm": 0.7897412776947021, + "learning_rate": 6.522223347170193e-05, + "loss": 1.472, + "step": 6880 + }, + { + "epoch": 2.088632569433905, + "grad_norm": 0.739872932434082, + "learning_rate": 6.521717120583173e-05, + "loss": 1.7092, + "step": 6881 + }, + { + "epoch": 2.0889361056305966, + "grad_norm": 0.6356867551803589, + "learning_rate": 6.521210893996153e-05, + "loss": 1.3256, + "step": 6882 + }, + { + "epoch": 2.089239641827288, + "grad_norm": 0.6607017517089844, + "learning_rate": 6.520704667409132e-05, + "loss": 1.0546, + "step": 6883 + }, + { + "epoch": 2.0895431780239795, + "grad_norm": 0.6129403114318848, + "learning_rate": 6.520198440822113e-05, + "loss": 1.3164, + "step": 6884 + }, + { + "epoch": 2.0898467142206707, + "grad_norm": 0.6967512369155884, + "learning_rate": 6.519692214235093e-05, + "loss": 1.3868, + "step": 6885 + }, + { + "epoch": 2.0901502504173624, + "grad_norm": 0.6828495860099792, + "learning_rate": 6.519185987648072e-05, + "loss": 1.5314, + "step": 6886 + }, + { + "epoch": 2.0904537866140536, + "grad_norm": 0.6235039234161377, + "learning_rate": 6.518679761061052e-05, + "loss": 1.0667, + "step": 6887 + }, + { + "epoch": 2.0907573228107452, + "grad_norm": 0.8770887851715088, + "learning_rate": 6.518173534474031e-05, + "loss": 1.2815, + "step": 6888 + }, + { + "epoch": 2.0910608590074364, + "grad_norm": 0.6768497824668884, + "learning_rate": 6.51766730788701e-05, + "loss": 1.2396, + "step": 6889 + }, + { + "epoch": 2.091364395204128, + "grad_norm": 0.7875215411186218, + "learning_rate": 6.51716108129999e-05, + "loss": 1.329, + "step": 6890 + }, + { + "epoch": 2.0916679314008197, + "grad_norm": 0.6576293706893921, + "learning_rate": 6.51665485471297e-05, + "loss": 1.5352, + "step": 6891 + }, + { + "epoch": 2.091971467597511, + "grad_norm": 1.1005089282989502, + "learning_rate": 6.51614862812595e-05, + "loss": 1.2099, + "step": 6892 + }, + { + "epoch": 2.0922750037942026, + "grad_norm": 0.6148272752761841, + "learning_rate": 6.51564240153893e-05, + "loss": 0.9366, + "step": 6893 + }, + { + "epoch": 2.092578539990894, + "grad_norm": 0.5925896167755127, + "learning_rate": 6.51513617495191e-05, + "loss": 1.4104, + "step": 6894 + }, + { + "epoch": 2.0928820761875855, + "grad_norm": 0.7109888792037964, + "learning_rate": 6.514629948364889e-05, + "loss": 1.3707, + "step": 6895 + }, + { + "epoch": 2.0931856123842767, + "grad_norm": 0.6179462671279907, + "learning_rate": 6.514123721777868e-05, + "loss": 1.339, + "step": 6896 + }, + { + "epoch": 2.0934891485809684, + "grad_norm": 1.1650300025939941, + "learning_rate": 6.513617495190848e-05, + "loss": 1.1503, + "step": 6897 + }, + { + "epoch": 2.0937926847776596, + "grad_norm": 0.507260799407959, + "learning_rate": 6.513111268603827e-05, + "loss": 1.6823, + "step": 6898 + }, + { + "epoch": 2.094096220974351, + "grad_norm": 0.718826174736023, + "learning_rate": 6.512605042016807e-05, + "loss": 1.4065, + "step": 6899 + }, + { + "epoch": 2.0943997571710424, + "grad_norm": 0.7679693698883057, + "learning_rate": 6.512098815429786e-05, + "loss": 1.0725, + "step": 6900 + }, + { + "epoch": 2.094703293367734, + "grad_norm": 0.8759974241256714, + "learning_rate": 6.511592588842767e-05, + "loss": 1.576, + "step": 6901 + }, + { + "epoch": 2.0950068295644257, + "grad_norm": 0.7068794369697571, + "learning_rate": 6.511086362255747e-05, + "loss": 1.4895, + "step": 6902 + }, + { + "epoch": 2.095310365761117, + "grad_norm": 0.8207367062568665, + "learning_rate": 6.510580135668726e-05, + "loss": 1.2166, + "step": 6903 + }, + { + "epoch": 2.0956139019578086, + "grad_norm": 0.6428210735321045, + "learning_rate": 6.510073909081705e-05, + "loss": 0.7282, + "step": 6904 + }, + { + "epoch": 2.0959174381545, + "grad_norm": 0.7879844307899475, + "learning_rate": 6.509567682494685e-05, + "loss": 1.4551, + "step": 6905 + }, + { + "epoch": 2.0962209743511915, + "grad_norm": 0.5059447884559631, + "learning_rate": 6.509061455907664e-05, + "loss": 0.7051, + "step": 6906 + }, + { + "epoch": 2.0965245105478827, + "grad_norm": 0.9014713168144226, + "learning_rate": 6.508555229320644e-05, + "loss": 0.9653, + "step": 6907 + }, + { + "epoch": 2.0968280467445743, + "grad_norm": 0.7185696363449097, + "learning_rate": 6.508049002733623e-05, + "loss": 1.2964, + "step": 6908 + }, + { + "epoch": 2.0971315829412656, + "grad_norm": 0.7701075673103333, + "learning_rate": 6.507542776146603e-05, + "loss": 1.3248, + "step": 6909 + }, + { + "epoch": 2.097435119137957, + "grad_norm": 0.8957740068435669, + "learning_rate": 6.507036549559584e-05, + "loss": 1.5749, + "step": 6910 + }, + { + "epoch": 2.097738655334649, + "grad_norm": 0.7184285521507263, + "learning_rate": 6.506530322972563e-05, + "loss": 1.1747, + "step": 6911 + }, + { + "epoch": 2.09804219153134, + "grad_norm": 0.7738087177276611, + "learning_rate": 6.506024096385543e-05, + "loss": 1.5662, + "step": 6912 + }, + { + "epoch": 2.0983457277280317, + "grad_norm": 0.7775218486785889, + "learning_rate": 6.505517869798522e-05, + "loss": 1.1571, + "step": 6913 + }, + { + "epoch": 2.098649263924723, + "grad_norm": 0.6527006030082703, + "learning_rate": 6.505011643211502e-05, + "loss": 1.0058, + "step": 6914 + }, + { + "epoch": 2.0989528001214146, + "grad_norm": 0.5389887690544128, + "learning_rate": 6.504505416624481e-05, + "loss": 1.3973, + "step": 6915 + }, + { + "epoch": 2.099256336318106, + "grad_norm": 0.8791207671165466, + "learning_rate": 6.50399919003746e-05, + "loss": 1.1055, + "step": 6916 + }, + { + "epoch": 2.0995598725147975, + "grad_norm": 0.6769183278083801, + "learning_rate": 6.50349296345044e-05, + "loss": 1.0076, + "step": 6917 + }, + { + "epoch": 2.0998634087114887, + "grad_norm": 0.7800582051277161, + "learning_rate": 6.50298673686342e-05, + "loss": 1.3135, + "step": 6918 + }, + { + "epoch": 2.1001669449081803, + "grad_norm": 0.6300270557403564, + "learning_rate": 6.502480510276399e-05, + "loss": 1.354, + "step": 6919 + }, + { + "epoch": 2.1004704811048716, + "grad_norm": 0.8697200417518616, + "learning_rate": 6.50197428368938e-05, + "loss": 1.2348, + "step": 6920 + }, + { + "epoch": 2.100774017301563, + "grad_norm": 0.7431923151016235, + "learning_rate": 6.50146805710236e-05, + "loss": 1.2332, + "step": 6921 + }, + { + "epoch": 2.101077553498255, + "grad_norm": 0.7904285192489624, + "learning_rate": 6.500961830515339e-05, + "loss": 1.0013, + "step": 6922 + }, + { + "epoch": 2.101381089694946, + "grad_norm": 0.7916845679283142, + "learning_rate": 6.500455603928318e-05, + "loss": 1.2898, + "step": 6923 + }, + { + "epoch": 2.1016846258916377, + "grad_norm": 0.785738468170166, + "learning_rate": 6.499949377341298e-05, + "loss": 1.6627, + "step": 6924 + }, + { + "epoch": 2.101988162088329, + "grad_norm": 0.8456923365592957, + "learning_rate": 6.499443150754277e-05, + "loss": 1.3163, + "step": 6925 + }, + { + "epoch": 2.1022916982850206, + "grad_norm": 0.607767641544342, + "learning_rate": 6.498936924167257e-05, + "loss": 1.5545, + "step": 6926 + }, + { + "epoch": 2.102595234481712, + "grad_norm": 0.7314704656600952, + "learning_rate": 6.498430697580236e-05, + "loss": 1.4258, + "step": 6927 + }, + { + "epoch": 2.1028987706784035, + "grad_norm": 0.7437986135482788, + "learning_rate": 6.497924470993216e-05, + "loss": 1.4507, + "step": 6928 + }, + { + "epoch": 2.1032023068750947, + "grad_norm": 0.6680194139480591, + "learning_rate": 6.497418244406197e-05, + "loss": 1.0842, + "step": 6929 + }, + { + "epoch": 2.1035058430717863, + "grad_norm": 0.8030607104301453, + "learning_rate": 6.496912017819176e-05, + "loss": 0.9657, + "step": 6930 + }, + { + "epoch": 2.1038093792684776, + "grad_norm": 0.9499925971031189, + "learning_rate": 6.496405791232157e-05, + "loss": 1.3172, + "step": 6931 + }, + { + "epoch": 2.104112915465169, + "grad_norm": 0.6064600348472595, + "learning_rate": 6.495899564645136e-05, + "loss": 1.5706, + "step": 6932 + }, + { + "epoch": 2.104416451661861, + "grad_norm": 0.7133974432945251, + "learning_rate": 6.495393338058116e-05, + "loss": 0.6775, + "step": 6933 + }, + { + "epoch": 2.104719987858552, + "grad_norm": 0.7160429954528809, + "learning_rate": 6.494887111471095e-05, + "loss": 1.4584, + "step": 6934 + }, + { + "epoch": 2.1050235240552437, + "grad_norm": 0.6486138701438904, + "learning_rate": 6.494380884884075e-05, + "loss": 0.9498, + "step": 6935 + }, + { + "epoch": 2.105327060251935, + "grad_norm": 0.8397301435470581, + "learning_rate": 6.493874658297054e-05, + "loss": 1.3461, + "step": 6936 + }, + { + "epoch": 2.1056305964486266, + "grad_norm": 0.7628511786460876, + "learning_rate": 6.493368431710034e-05, + "loss": 1.556, + "step": 6937 + }, + { + "epoch": 2.105934132645318, + "grad_norm": 0.7449336647987366, + "learning_rate": 6.492862205123013e-05, + "loss": 1.3249, + "step": 6938 + }, + { + "epoch": 2.1062376688420095, + "grad_norm": 0.5359498262405396, + "learning_rate": 6.492355978535993e-05, + "loss": 0.9697, + "step": 6939 + }, + { + "epoch": 2.1065412050387007, + "grad_norm": 0.6996994614601135, + "learning_rate": 6.491849751948974e-05, + "loss": 1.2401, + "step": 6940 + }, + { + "epoch": 2.1068447412353923, + "grad_norm": 0.9314258694648743, + "learning_rate": 6.491343525361953e-05, + "loss": 1.42, + "step": 6941 + }, + { + "epoch": 2.107148277432084, + "grad_norm": 0.8202747702598572, + "learning_rate": 6.490837298774932e-05, + "loss": 1.1554, + "step": 6942 + }, + { + "epoch": 2.107451813628775, + "grad_norm": 0.8922327756881714, + "learning_rate": 6.490331072187912e-05, + "loss": 1.0766, + "step": 6943 + }, + { + "epoch": 2.107755349825467, + "grad_norm": 0.6795501708984375, + "learning_rate": 6.489824845600891e-05, + "loss": 1.0089, + "step": 6944 + }, + { + "epoch": 2.108058886022158, + "grad_norm": 0.5984511971473694, + "learning_rate": 6.489318619013871e-05, + "loss": 1.297, + "step": 6945 + }, + { + "epoch": 2.1083624222188497, + "grad_norm": 0.6586083769798279, + "learning_rate": 6.48881239242685e-05, + "loss": 1.3293, + "step": 6946 + }, + { + "epoch": 2.108665958415541, + "grad_norm": 0.6119650602340698, + "learning_rate": 6.48830616583983e-05, + "loss": 0.6706, + "step": 6947 + }, + { + "epoch": 2.1089694946122326, + "grad_norm": 0.8025054335594177, + "learning_rate": 6.48779993925281e-05, + "loss": 1.1377, + "step": 6948 + }, + { + "epoch": 2.109273030808924, + "grad_norm": 0.88395756483078, + "learning_rate": 6.48729371266579e-05, + "loss": 1.1058, + "step": 6949 + }, + { + "epoch": 2.1095765670056155, + "grad_norm": 0.7020336389541626, + "learning_rate": 6.48678748607877e-05, + "loss": 1.1002, + "step": 6950 + }, + { + "epoch": 2.1098801032023067, + "grad_norm": 0.6096410155296326, + "learning_rate": 6.486281259491749e-05, + "loss": 1.5273, + "step": 6951 + }, + { + "epoch": 2.1101836393989983, + "grad_norm": 0.6165260672569275, + "learning_rate": 6.485775032904729e-05, + "loss": 1.0085, + "step": 6952 + }, + { + "epoch": 2.11048717559569, + "grad_norm": 0.6977746486663818, + "learning_rate": 6.485268806317708e-05, + "loss": 1.5208, + "step": 6953 + }, + { + "epoch": 2.110790711792381, + "grad_norm": 0.7142164707183838, + "learning_rate": 6.484762579730688e-05, + "loss": 1.4821, + "step": 6954 + }, + { + "epoch": 2.111094247989073, + "grad_norm": 0.7390041351318359, + "learning_rate": 6.484256353143667e-05, + "loss": 1.218, + "step": 6955 + }, + { + "epoch": 2.111397784185764, + "grad_norm": 0.8610092997550964, + "learning_rate": 6.483750126556647e-05, + "loss": 1.1259, + "step": 6956 + }, + { + "epoch": 2.1117013203824557, + "grad_norm": 0.807457447052002, + "learning_rate": 6.483243899969626e-05, + "loss": 1.3557, + "step": 6957 + }, + { + "epoch": 2.112004856579147, + "grad_norm": 0.7799239754676819, + "learning_rate": 6.482737673382606e-05, + "loss": 1.1888, + "step": 6958 + }, + { + "epoch": 2.1123083927758386, + "grad_norm": 0.7552769780158997, + "learning_rate": 6.482231446795586e-05, + "loss": 1.2615, + "step": 6959 + }, + { + "epoch": 2.11261192897253, + "grad_norm": 0.8106013536453247, + "learning_rate": 6.481725220208566e-05, + "loss": 1.3605, + "step": 6960 + }, + { + "epoch": 2.1129154651692215, + "grad_norm": 0.6781620383262634, + "learning_rate": 6.481218993621545e-05, + "loss": 1.2664, + "step": 6961 + }, + { + "epoch": 2.1132190013659127, + "grad_norm": 0.8459703326225281, + "learning_rate": 6.480712767034525e-05, + "loss": 1.6223, + "step": 6962 + }, + { + "epoch": 2.1135225375626043, + "grad_norm": 0.6246853470802307, + "learning_rate": 6.480206540447504e-05, + "loss": 1.4559, + "step": 6963 + }, + { + "epoch": 2.113826073759296, + "grad_norm": 0.5837286114692688, + "learning_rate": 6.479700313860484e-05, + "loss": 1.1062, + "step": 6964 + }, + { + "epoch": 2.114129609955987, + "grad_norm": 0.5617137551307678, + "learning_rate": 6.479194087273463e-05, + "loss": 1.0567, + "step": 6965 + }, + { + "epoch": 2.114433146152679, + "grad_norm": 0.7016701102256775, + "learning_rate": 6.478687860686443e-05, + "loss": 1.4226, + "step": 6966 + }, + { + "epoch": 2.11473668234937, + "grad_norm": 0.7260595560073853, + "learning_rate": 6.478181634099422e-05, + "loss": 1.3857, + "step": 6967 + }, + { + "epoch": 2.1150402185460617, + "grad_norm": 0.5604128837585449, + "learning_rate": 6.477675407512403e-05, + "loss": 1.6041, + "step": 6968 + }, + { + "epoch": 2.115343754742753, + "grad_norm": 0.8095130324363708, + "learning_rate": 6.477169180925382e-05, + "loss": 1.2849, + "step": 6969 + }, + { + "epoch": 2.1156472909394446, + "grad_norm": 0.7363386750221252, + "learning_rate": 6.476662954338362e-05, + "loss": 1.3677, + "step": 6970 + }, + { + "epoch": 2.115950827136136, + "grad_norm": 0.7337886095046997, + "learning_rate": 6.476156727751341e-05, + "loss": 1.5134, + "step": 6971 + }, + { + "epoch": 2.1162543633328275, + "grad_norm": 0.7493162155151367, + "learning_rate": 6.475650501164321e-05, + "loss": 0.8987, + "step": 6972 + }, + { + "epoch": 2.1165578995295187, + "grad_norm": 0.7532170414924622, + "learning_rate": 6.475144274577302e-05, + "loss": 1.398, + "step": 6973 + }, + { + "epoch": 2.1168614357262103, + "grad_norm": 0.7840539813041687, + "learning_rate": 6.474638047990281e-05, + "loss": 1.5745, + "step": 6974 + }, + { + "epoch": 2.117164971922902, + "grad_norm": 0.7847781777381897, + "learning_rate": 6.474131821403261e-05, + "loss": 1.4612, + "step": 6975 + }, + { + "epoch": 2.117468508119593, + "grad_norm": 0.6995275020599365, + "learning_rate": 6.47362559481624e-05, + "loss": 1.4264, + "step": 6976 + }, + { + "epoch": 2.117772044316285, + "grad_norm": 0.9341135621070862, + "learning_rate": 6.47311936822922e-05, + "loss": 1.3877, + "step": 6977 + }, + { + "epoch": 2.118075580512976, + "grad_norm": 0.7916807532310486, + "learning_rate": 6.472613141642199e-05, + "loss": 1.0889, + "step": 6978 + }, + { + "epoch": 2.1183791167096677, + "grad_norm": 0.789832592010498, + "learning_rate": 6.47210691505518e-05, + "loss": 1.0189, + "step": 6979 + }, + { + "epoch": 2.118682652906359, + "grad_norm": 1.2565356492996216, + "learning_rate": 6.47160068846816e-05, + "loss": 1.1957, + "step": 6980 + }, + { + "epoch": 2.1189861891030506, + "grad_norm": 0.5494474768638611, + "learning_rate": 6.471094461881139e-05, + "loss": 0.9361, + "step": 6981 + }, + { + "epoch": 2.119289725299742, + "grad_norm": 0.6757771968841553, + "learning_rate": 6.470588235294118e-05, + "loss": 1.3835, + "step": 6982 + }, + { + "epoch": 2.1195932614964335, + "grad_norm": 0.875653088092804, + "learning_rate": 6.470082008707098e-05, + "loss": 1.3736, + "step": 6983 + }, + { + "epoch": 2.119896797693125, + "grad_norm": 0.8260795474052429, + "learning_rate": 6.469575782120077e-05, + "loss": 0.9047, + "step": 6984 + }, + { + "epoch": 2.1202003338898163, + "grad_norm": 0.6649807095527649, + "learning_rate": 6.469069555533057e-05, + "loss": 0.8367, + "step": 6985 + }, + { + "epoch": 2.120503870086508, + "grad_norm": 0.7601671814918518, + "learning_rate": 6.468563328946036e-05, + "loss": 1.4975, + "step": 6986 + }, + { + "epoch": 2.120807406283199, + "grad_norm": 0.7919801473617554, + "learning_rate": 6.468057102359016e-05, + "loss": 1.2241, + "step": 6987 + }, + { + "epoch": 2.121110942479891, + "grad_norm": 0.8133627772331238, + "learning_rate": 6.467550875771997e-05, + "loss": 1.3707, + "step": 6988 + }, + { + "epoch": 2.121414478676582, + "grad_norm": 0.7252787947654724, + "learning_rate": 6.467044649184976e-05, + "loss": 0.9922, + "step": 6989 + }, + { + "epoch": 2.1217180148732737, + "grad_norm": 0.7415847182273865, + "learning_rate": 6.466538422597956e-05, + "loss": 0.3215, + "step": 6990 + }, + { + "epoch": 2.122021551069965, + "grad_norm": 0.5860079526901245, + "learning_rate": 6.466032196010935e-05, + "loss": 1.6003, + "step": 6991 + }, + { + "epoch": 2.1223250872666566, + "grad_norm": 0.8746591806411743, + "learning_rate": 6.465525969423915e-05, + "loss": 1.1195, + "step": 6992 + }, + { + "epoch": 2.122628623463348, + "grad_norm": 0.7454144954681396, + "learning_rate": 6.465019742836894e-05, + "loss": 1.0494, + "step": 6993 + }, + { + "epoch": 2.1229321596600395, + "grad_norm": 0.7970776557922363, + "learning_rate": 6.464513516249874e-05, + "loss": 1.5679, + "step": 6994 + }, + { + "epoch": 2.123235695856731, + "grad_norm": 0.9127287864685059, + "learning_rate": 6.464007289662853e-05, + "loss": 1.1722, + "step": 6995 + }, + { + "epoch": 2.1235392320534223, + "grad_norm": 0.5542991161346436, + "learning_rate": 6.463501063075833e-05, + "loss": 1.7838, + "step": 6996 + }, + { + "epoch": 2.123842768250114, + "grad_norm": 0.8293723464012146, + "learning_rate": 6.462994836488812e-05, + "loss": 1.3326, + "step": 6997 + }, + { + "epoch": 2.124146304446805, + "grad_norm": 0.8169488310813904, + "learning_rate": 6.462488609901793e-05, + "loss": 1.0697, + "step": 6998 + }, + { + "epoch": 2.124449840643497, + "grad_norm": 0.6636808514595032, + "learning_rate": 6.461982383314772e-05, + "loss": 1.2945, + "step": 6999 + }, + { + "epoch": 2.124753376840188, + "grad_norm": 0.982451856136322, + "learning_rate": 6.461476156727752e-05, + "loss": 1.0061, + "step": 7000 + }, + { + "epoch": 2.1250569130368797, + "grad_norm": 0.8102851510047913, + "learning_rate": 6.460969930140731e-05, + "loss": 1.1035, + "step": 7001 + }, + { + "epoch": 2.125360449233571, + "grad_norm": 0.915728747844696, + "learning_rate": 6.460463703553711e-05, + "loss": 1.1196, + "step": 7002 + }, + { + "epoch": 2.1256639854302626, + "grad_norm": 0.8686944842338562, + "learning_rate": 6.45995747696669e-05, + "loss": 1.092, + "step": 7003 + }, + { + "epoch": 2.1259675216269542, + "grad_norm": 0.7031385898590088, + "learning_rate": 6.45945125037967e-05, + "loss": 1.202, + "step": 7004 + }, + { + "epoch": 2.1262710578236454, + "grad_norm": 0.7822988629341125, + "learning_rate": 6.458945023792649e-05, + "loss": 1.1374, + "step": 7005 + }, + { + "epoch": 2.126574594020337, + "grad_norm": 0.6261591911315918, + "learning_rate": 6.458438797205629e-05, + "loss": 1.091, + "step": 7006 + }, + { + "epoch": 2.1268781302170283, + "grad_norm": 0.7939580082893372, + "learning_rate": 6.45793257061861e-05, + "loss": 0.9854, + "step": 7007 + }, + { + "epoch": 2.12718166641372, + "grad_norm": 0.6935524940490723, + "learning_rate": 6.457426344031589e-05, + "loss": 1.5513, + "step": 7008 + }, + { + "epoch": 2.127485202610411, + "grad_norm": 0.6789997816085815, + "learning_rate": 6.456920117444568e-05, + "loss": 1.5058, + "step": 7009 + }, + { + "epoch": 2.127788738807103, + "grad_norm": 0.60723876953125, + "learning_rate": 6.456413890857548e-05, + "loss": 1.2242, + "step": 7010 + }, + { + "epoch": 2.128092275003794, + "grad_norm": 0.927089512348175, + "learning_rate": 6.455907664270527e-05, + "loss": 1.2623, + "step": 7011 + }, + { + "epoch": 2.1283958112004857, + "grad_norm": 0.6675049066543579, + "learning_rate": 6.455401437683507e-05, + "loss": 1.6692, + "step": 7012 + }, + { + "epoch": 2.128699347397177, + "grad_norm": 0.6772061586380005, + "learning_rate": 6.454895211096486e-05, + "loss": 1.4969, + "step": 7013 + }, + { + "epoch": 2.1290028835938686, + "grad_norm": 0.820059597492218, + "learning_rate": 6.454388984509466e-05, + "loss": 1.3942, + "step": 7014 + }, + { + "epoch": 2.1293064197905602, + "grad_norm": 0.6421627998352051, + "learning_rate": 6.453882757922445e-05, + "loss": 1.245, + "step": 7015 + }, + { + "epoch": 2.1296099559872514, + "grad_norm": 0.813875675201416, + "learning_rate": 6.453376531335426e-05, + "loss": 1.1182, + "step": 7016 + }, + { + "epoch": 2.129913492183943, + "grad_norm": 0.7777447700500488, + "learning_rate": 6.452870304748406e-05, + "loss": 1.2357, + "step": 7017 + }, + { + "epoch": 2.1302170283806343, + "grad_norm": 0.7813501358032227, + "learning_rate": 6.452364078161386e-05, + "loss": 1.3377, + "step": 7018 + }, + { + "epoch": 2.130520564577326, + "grad_norm": 0.6702557802200317, + "learning_rate": 6.451857851574366e-05, + "loss": 0.772, + "step": 7019 + }, + { + "epoch": 2.130824100774017, + "grad_norm": 0.5846662521362305, + "learning_rate": 6.451351624987345e-05, + "loss": 1.1361, + "step": 7020 + }, + { + "epoch": 2.131127636970709, + "grad_norm": 0.677483856678009, + "learning_rate": 6.450845398400325e-05, + "loss": 1.6367, + "step": 7021 + }, + { + "epoch": 2.1314311731674, + "grad_norm": 0.7289956212043762, + "learning_rate": 6.450339171813304e-05, + "loss": 1.5031, + "step": 7022 + }, + { + "epoch": 2.1317347093640917, + "grad_norm": 0.7819235324859619, + "learning_rate": 6.449832945226284e-05, + "loss": 1.1371, + "step": 7023 + }, + { + "epoch": 2.132038245560783, + "grad_norm": 0.6361702680587769, + "learning_rate": 6.449326718639263e-05, + "loss": 0.8756, + "step": 7024 + }, + { + "epoch": 2.1323417817574746, + "grad_norm": 0.6113557815551758, + "learning_rate": 6.448820492052243e-05, + "loss": 1.0094, + "step": 7025 + }, + { + "epoch": 2.1326453179541662, + "grad_norm": 0.7358946800231934, + "learning_rate": 6.448314265465222e-05, + "loss": 1.2521, + "step": 7026 + }, + { + "epoch": 2.1329488541508574, + "grad_norm": 0.8237258195877075, + "learning_rate": 6.447808038878203e-05, + "loss": 1.4365, + "step": 7027 + }, + { + "epoch": 2.133252390347549, + "grad_norm": 0.7153835892677307, + "learning_rate": 6.447301812291183e-05, + "loss": 1.1588, + "step": 7028 + }, + { + "epoch": 2.1335559265442403, + "grad_norm": 0.793786346912384, + "learning_rate": 6.446795585704162e-05, + "loss": 1.2753, + "step": 7029 + }, + { + "epoch": 2.133859462740932, + "grad_norm": 0.7200156450271606, + "learning_rate": 6.446289359117142e-05, + "loss": 0.7547, + "step": 7030 + }, + { + "epoch": 2.134162998937623, + "grad_norm": 0.6647316813468933, + "learning_rate": 6.445783132530121e-05, + "loss": 1.3108, + "step": 7031 + }, + { + "epoch": 2.134466535134315, + "grad_norm": 0.7873644828796387, + "learning_rate": 6.4452769059431e-05, + "loss": 1.4656, + "step": 7032 + }, + { + "epoch": 2.134770071331006, + "grad_norm": 0.6672101616859436, + "learning_rate": 6.44477067935608e-05, + "loss": 1.5885, + "step": 7033 + }, + { + "epoch": 2.1350736075276977, + "grad_norm": 0.9114254713058472, + "learning_rate": 6.44426445276906e-05, + "loss": 0.8359, + "step": 7034 + }, + { + "epoch": 2.135377143724389, + "grad_norm": 0.6775479316711426, + "learning_rate": 6.443758226182039e-05, + "loss": 1.4429, + "step": 7035 + }, + { + "epoch": 2.1356806799210806, + "grad_norm": 0.7736831307411194, + "learning_rate": 6.443251999595018e-05, + "loss": 1.0994, + "step": 7036 + }, + { + "epoch": 2.1359842161177722, + "grad_norm": 0.6583807468414307, + "learning_rate": 6.442745773007999e-05, + "loss": 1.0765, + "step": 7037 + }, + { + "epoch": 2.1362877523144634, + "grad_norm": 0.6119073629379272, + "learning_rate": 6.442239546420979e-05, + "loss": 1.475, + "step": 7038 + }, + { + "epoch": 2.136591288511155, + "grad_norm": 0.8550370931625366, + "learning_rate": 6.441733319833958e-05, + "loss": 1.1355, + "step": 7039 + }, + { + "epoch": 2.1368948247078463, + "grad_norm": 0.7692386507987976, + "learning_rate": 6.441227093246938e-05, + "loss": 1.1812, + "step": 7040 + }, + { + "epoch": 2.137198360904538, + "grad_norm": 0.7219559550285339, + "learning_rate": 6.440720866659917e-05, + "loss": 1.4073, + "step": 7041 + }, + { + "epoch": 2.137501897101229, + "grad_norm": 0.6751419901847839, + "learning_rate": 6.440214640072897e-05, + "loss": 0.9089, + "step": 7042 + }, + { + "epoch": 2.137805433297921, + "grad_norm": 0.7274761199951172, + "learning_rate": 6.439708413485876e-05, + "loss": 1.4312, + "step": 7043 + }, + { + "epoch": 2.138108969494612, + "grad_norm": 0.8742554187774658, + "learning_rate": 6.439202186898856e-05, + "loss": 1.098, + "step": 7044 + }, + { + "epoch": 2.1384125056913037, + "grad_norm": 0.6010596752166748, + "learning_rate": 6.438695960311835e-05, + "loss": 1.1069, + "step": 7045 + }, + { + "epoch": 2.138716041887995, + "grad_norm": 0.7470264434814453, + "learning_rate": 6.438189733724816e-05, + "loss": 1.0936, + "step": 7046 + }, + { + "epoch": 2.1390195780846866, + "grad_norm": 0.620542049407959, + "learning_rate": 6.437683507137795e-05, + "loss": 1.3027, + "step": 7047 + }, + { + "epoch": 2.1393231142813782, + "grad_norm": 0.7897995710372925, + "learning_rate": 6.437177280550775e-05, + "loss": 1.3731, + "step": 7048 + }, + { + "epoch": 2.1396266504780694, + "grad_norm": 0.6937667727470398, + "learning_rate": 6.436671053963754e-05, + "loss": 0.8121, + "step": 7049 + }, + { + "epoch": 2.139930186674761, + "grad_norm": 0.7295348644256592, + "learning_rate": 6.436164827376734e-05, + "loss": 1.0723, + "step": 7050 + }, + { + "epoch": 2.1402337228714523, + "grad_norm": 0.7741212248802185, + "learning_rate": 6.435658600789713e-05, + "loss": 1.233, + "step": 7051 + }, + { + "epoch": 2.140537259068144, + "grad_norm": 0.7276745438575745, + "learning_rate": 6.435152374202693e-05, + "loss": 1.6296, + "step": 7052 + }, + { + "epoch": 2.140840795264835, + "grad_norm": 0.7447855472564697, + "learning_rate": 6.434646147615672e-05, + "loss": 1.0548, + "step": 7053 + }, + { + "epoch": 2.141144331461527, + "grad_norm": 0.6452568173408508, + "learning_rate": 6.434139921028652e-05, + "loss": 1.0692, + "step": 7054 + }, + { + "epoch": 2.141447867658218, + "grad_norm": 0.851493775844574, + "learning_rate": 6.433633694441633e-05, + "loss": 1.3675, + "step": 7055 + }, + { + "epoch": 2.1417514038549097, + "grad_norm": 0.6594175696372986, + "learning_rate": 6.433127467854612e-05, + "loss": 1.2718, + "step": 7056 + }, + { + "epoch": 2.1420549400516014, + "grad_norm": 0.9136268496513367, + "learning_rate": 6.432621241267592e-05, + "loss": 0.8236, + "step": 7057 + }, + { + "epoch": 2.1423584762482926, + "grad_norm": 0.8292484879493713, + "learning_rate": 6.432115014680571e-05, + "loss": 1.2484, + "step": 7058 + }, + { + "epoch": 2.142662012444984, + "grad_norm": 0.7154607772827148, + "learning_rate": 6.43160878809355e-05, + "loss": 1.5388, + "step": 7059 + }, + { + "epoch": 2.1429655486416754, + "grad_norm": 0.7923582196235657, + "learning_rate": 6.43110256150653e-05, + "loss": 1.1922, + "step": 7060 + }, + { + "epoch": 2.143269084838367, + "grad_norm": 0.779097855091095, + "learning_rate": 6.43059633491951e-05, + "loss": 1.3437, + "step": 7061 + }, + { + "epoch": 2.1435726210350583, + "grad_norm": 0.5886580348014832, + "learning_rate": 6.43009010833249e-05, + "loss": 1.1589, + "step": 7062 + }, + { + "epoch": 2.14387615723175, + "grad_norm": 0.7474644184112549, + "learning_rate": 6.42958388174547e-05, + "loss": 1.4397, + "step": 7063 + }, + { + "epoch": 2.144179693428441, + "grad_norm": 0.6950685977935791, + "learning_rate": 6.429077655158449e-05, + "loss": 1.4432, + "step": 7064 + }, + { + "epoch": 2.144483229625133, + "grad_norm": 0.9418857097625732, + "learning_rate": 6.428571428571429e-05, + "loss": 1.1887, + "step": 7065 + }, + { + "epoch": 2.1447867658218245, + "grad_norm": 0.9572475552558899, + "learning_rate": 6.42806520198441e-05, + "loss": 1.453, + "step": 7066 + }, + { + "epoch": 2.1450903020185157, + "grad_norm": 0.799579918384552, + "learning_rate": 6.427558975397389e-05, + "loss": 1.3219, + "step": 7067 + }, + { + "epoch": 2.1453938382152073, + "grad_norm": 0.7733772993087769, + "learning_rate": 6.427052748810369e-05, + "loss": 1.4549, + "step": 7068 + }, + { + "epoch": 2.1456973744118986, + "grad_norm": 0.7334538698196411, + "learning_rate": 6.426546522223348e-05, + "loss": 1.4377, + "step": 7069 + }, + { + "epoch": 2.14600091060859, + "grad_norm": 0.6159878373146057, + "learning_rate": 6.426040295636328e-05, + "loss": 1.6952, + "step": 7070 + }, + { + "epoch": 2.1463044468052814, + "grad_norm": 0.8680784702301025, + "learning_rate": 6.425534069049307e-05, + "loss": 1.2469, + "step": 7071 + }, + { + "epoch": 2.146607983001973, + "grad_norm": 1.383750081062317, + "learning_rate": 6.425027842462286e-05, + "loss": 0.6063, + "step": 7072 + }, + { + "epoch": 2.1469115191986643, + "grad_norm": 0.6771996021270752, + "learning_rate": 6.424521615875266e-05, + "loss": 1.3287, + "step": 7073 + }, + { + "epoch": 2.147215055395356, + "grad_norm": 0.5610653162002563, + "learning_rate": 6.424015389288245e-05, + "loss": 1.5291, + "step": 7074 + }, + { + "epoch": 2.147518591592047, + "grad_norm": 0.7199169993400574, + "learning_rate": 6.423509162701225e-05, + "loss": 1.3695, + "step": 7075 + }, + { + "epoch": 2.147822127788739, + "grad_norm": 0.7059941291809082, + "learning_rate": 6.423002936114206e-05, + "loss": 0.8093, + "step": 7076 + }, + { + "epoch": 2.1481256639854305, + "grad_norm": 0.6837321519851685, + "learning_rate": 6.422496709527185e-05, + "loss": 1.3739, + "step": 7077 + }, + { + "epoch": 2.1484292001821217, + "grad_norm": 0.5235718488693237, + "learning_rate": 6.421990482940165e-05, + "loss": 1.292, + "step": 7078 + }, + { + "epoch": 2.1487327363788133, + "grad_norm": 0.6904390454292297, + "learning_rate": 6.421484256353144e-05, + "loss": 1.3419, + "step": 7079 + }, + { + "epoch": 2.1490362725755046, + "grad_norm": 0.6799563765525818, + "learning_rate": 6.420978029766124e-05, + "loss": 1.1699, + "step": 7080 + }, + { + "epoch": 2.149339808772196, + "grad_norm": 1.023763656616211, + "learning_rate": 6.420471803179103e-05, + "loss": 1.1691, + "step": 7081 + }, + { + "epoch": 2.1496433449688874, + "grad_norm": 0.6300176978111267, + "learning_rate": 6.419965576592083e-05, + "loss": 1.5441, + "step": 7082 + }, + { + "epoch": 2.149946881165579, + "grad_norm": 0.6105098128318787, + "learning_rate": 6.419459350005062e-05, + "loss": 1.0628, + "step": 7083 + }, + { + "epoch": 2.1502504173622703, + "grad_norm": 0.5488935112953186, + "learning_rate": 6.418953123418042e-05, + "loss": 1.3458, + "step": 7084 + }, + { + "epoch": 2.150553953558962, + "grad_norm": 0.6905235648155212, + "learning_rate": 6.418446896831022e-05, + "loss": 1.6791, + "step": 7085 + }, + { + "epoch": 2.150857489755653, + "grad_norm": 0.7746590375900269, + "learning_rate": 6.417940670244002e-05, + "loss": 1.2937, + "step": 7086 + }, + { + "epoch": 2.151161025952345, + "grad_norm": 0.6267426013946533, + "learning_rate": 6.417434443656981e-05, + "loss": 1.2033, + "step": 7087 + }, + { + "epoch": 2.1514645621490365, + "grad_norm": 0.7372788786888123, + "learning_rate": 6.416928217069961e-05, + "loss": 1.2304, + "step": 7088 + }, + { + "epoch": 2.1517680983457277, + "grad_norm": 0.6180323362350464, + "learning_rate": 6.41642199048294e-05, + "loss": 1.5392, + "step": 7089 + }, + { + "epoch": 2.1520716345424193, + "grad_norm": 0.6933856010437012, + "learning_rate": 6.41591576389592e-05, + "loss": 1.314, + "step": 7090 + }, + { + "epoch": 2.1523751707391106, + "grad_norm": 0.8059110641479492, + "learning_rate": 6.415409537308899e-05, + "loss": 1.4774, + "step": 7091 + }, + { + "epoch": 2.152678706935802, + "grad_norm": 0.7373825907707214, + "learning_rate": 6.414903310721879e-05, + "loss": 1.0461, + "step": 7092 + }, + { + "epoch": 2.1529822431324934, + "grad_norm": 0.8026410341262817, + "learning_rate": 6.414397084134858e-05, + "loss": 0.846, + "step": 7093 + }, + { + "epoch": 2.153285779329185, + "grad_norm": 0.6906172037124634, + "learning_rate": 6.413890857547839e-05, + "loss": 1.3885, + "step": 7094 + }, + { + "epoch": 2.1535893155258763, + "grad_norm": 1.1794425249099731, + "learning_rate": 6.413384630960819e-05, + "loss": 1.5075, + "step": 7095 + }, + { + "epoch": 2.153892851722568, + "grad_norm": 0.6099388003349304, + "learning_rate": 6.412878404373798e-05, + "loss": 1.1185, + "step": 7096 + }, + { + "epoch": 2.154196387919259, + "grad_norm": 0.7119961977005005, + "learning_rate": 6.412372177786778e-05, + "loss": 1.12, + "step": 7097 + }, + { + "epoch": 2.154499924115951, + "grad_norm": 0.7217562198638916, + "learning_rate": 6.411865951199757e-05, + "loss": 1.6259, + "step": 7098 + }, + { + "epoch": 2.1548034603126425, + "grad_norm": 0.7318760752677917, + "learning_rate": 6.411359724612737e-05, + "loss": 1.3047, + "step": 7099 + }, + { + "epoch": 2.1551069965093337, + "grad_norm": 0.7855448126792908, + "learning_rate": 6.410853498025716e-05, + "loss": 1.0818, + "step": 7100 + }, + { + "epoch": 2.1554105327060253, + "grad_norm": 0.7480208873748779, + "learning_rate": 6.410347271438695e-05, + "loss": 1.447, + "step": 7101 + }, + { + "epoch": 2.1557140689027166, + "grad_norm": 0.5373287796974182, + "learning_rate": 6.409841044851675e-05, + "loss": 1.7865, + "step": 7102 + }, + { + "epoch": 2.156017605099408, + "grad_norm": 0.9216307401657104, + "learning_rate": 6.409334818264654e-05, + "loss": 1.2057, + "step": 7103 + }, + { + "epoch": 2.1563211412960994, + "grad_norm": 0.7226337194442749, + "learning_rate": 6.408828591677635e-05, + "loss": 1.2489, + "step": 7104 + }, + { + "epoch": 2.156624677492791, + "grad_norm": 0.7996892929077148, + "learning_rate": 6.408322365090615e-05, + "loss": 1.4067, + "step": 7105 + }, + { + "epoch": 2.1569282136894823, + "grad_norm": 0.8255277872085571, + "learning_rate": 6.407816138503594e-05, + "loss": 1.0818, + "step": 7106 + }, + { + "epoch": 2.157231749886174, + "grad_norm": 0.7836741209030151, + "learning_rate": 6.407309911916575e-05, + "loss": 1.3656, + "step": 7107 + }, + { + "epoch": 2.157535286082865, + "grad_norm": 0.6470088362693787, + "learning_rate": 6.406803685329555e-05, + "loss": 1.0784, + "step": 7108 + }, + { + "epoch": 2.157838822279557, + "grad_norm": 0.6564112305641174, + "learning_rate": 6.406297458742534e-05, + "loss": 0.5021, + "step": 7109 + }, + { + "epoch": 2.1581423584762485, + "grad_norm": 0.6398088932037354, + "learning_rate": 6.405791232155513e-05, + "loss": 0.8162, + "step": 7110 + }, + { + "epoch": 2.1584458946729397, + "grad_norm": 0.8116580843925476, + "learning_rate": 6.405285005568493e-05, + "loss": 1.1574, + "step": 7111 + }, + { + "epoch": 2.1587494308696313, + "grad_norm": 1.0613367557525635, + "learning_rate": 6.404778778981472e-05, + "loss": 0.9812, + "step": 7112 + }, + { + "epoch": 2.1590529670663225, + "grad_norm": 0.7346512675285339, + "learning_rate": 6.404272552394452e-05, + "loss": 1.0567, + "step": 7113 + }, + { + "epoch": 2.159356503263014, + "grad_norm": 0.8014817237854004, + "learning_rate": 6.403766325807431e-05, + "loss": 1.4299, + "step": 7114 + }, + { + "epoch": 2.1596600394597054, + "grad_norm": 0.7765456438064575, + "learning_rate": 6.403260099220412e-05, + "loss": 1.4795, + "step": 7115 + }, + { + "epoch": 2.159963575656397, + "grad_norm": 0.6675356030464172, + "learning_rate": 6.402753872633392e-05, + "loss": 1.5016, + "step": 7116 + }, + { + "epoch": 2.1602671118530883, + "grad_norm": 0.8694089651107788, + "learning_rate": 6.402247646046371e-05, + "loss": 1.3718, + "step": 7117 + }, + { + "epoch": 2.16057064804978, + "grad_norm": 0.708865761756897, + "learning_rate": 6.401741419459351e-05, + "loss": 1.4019, + "step": 7118 + }, + { + "epoch": 2.1608741842464716, + "grad_norm": 0.624944806098938, + "learning_rate": 6.40123519287233e-05, + "loss": 1.346, + "step": 7119 + }, + { + "epoch": 2.161177720443163, + "grad_norm": 0.9114837050437927, + "learning_rate": 6.40072896628531e-05, + "loss": 1.2589, + "step": 7120 + }, + { + "epoch": 2.1614812566398545, + "grad_norm": 0.8413217067718506, + "learning_rate": 6.400222739698289e-05, + "loss": 1.3506, + "step": 7121 + }, + { + "epoch": 2.1617847928365457, + "grad_norm": 0.8894316554069519, + "learning_rate": 6.399716513111269e-05, + "loss": 1.5452, + "step": 7122 + }, + { + "epoch": 2.1620883290332373, + "grad_norm": 0.6457502245903015, + "learning_rate": 6.399210286524248e-05, + "loss": 0.5962, + "step": 7123 + }, + { + "epoch": 2.1623918652299285, + "grad_norm": 0.7590650320053101, + "learning_rate": 6.398704059937229e-05, + "loss": 1.6314, + "step": 7124 + }, + { + "epoch": 2.16269540142662, + "grad_norm": 0.6571962237358093, + "learning_rate": 6.398197833350208e-05, + "loss": 1.7331, + "step": 7125 + }, + { + "epoch": 2.1629989376233114, + "grad_norm": 0.8318181037902832, + "learning_rate": 6.397691606763188e-05, + "loss": 1.3389, + "step": 7126 + }, + { + "epoch": 2.163302473820003, + "grad_norm": 0.696306049823761, + "learning_rate": 6.397185380176167e-05, + "loss": 1.5082, + "step": 7127 + }, + { + "epoch": 2.1636060100166947, + "grad_norm": 0.7968202233314514, + "learning_rate": 6.396679153589147e-05, + "loss": 1.1368, + "step": 7128 + }, + { + "epoch": 2.163909546213386, + "grad_norm": 0.6538952589035034, + "learning_rate": 6.396172927002126e-05, + "loss": 1.2814, + "step": 7129 + }, + { + "epoch": 2.1642130824100776, + "grad_norm": 0.6986178159713745, + "learning_rate": 6.395666700415106e-05, + "loss": 1.3288, + "step": 7130 + }, + { + "epoch": 2.164516618606769, + "grad_norm": 0.7502809762954712, + "learning_rate": 6.395160473828085e-05, + "loss": 1.2749, + "step": 7131 + }, + { + "epoch": 2.1648201548034605, + "grad_norm": 0.7333081960678101, + "learning_rate": 6.394654247241065e-05, + "loss": 1.6966, + "step": 7132 + }, + { + "epoch": 2.1651236910001517, + "grad_norm": 0.7244516015052795, + "learning_rate": 6.394148020654046e-05, + "loss": 1.0677, + "step": 7133 + }, + { + "epoch": 2.1654272271968433, + "grad_norm": 0.613068163394928, + "learning_rate": 6.393641794067025e-05, + "loss": 1.2863, + "step": 7134 + }, + { + "epoch": 2.1657307633935345, + "grad_norm": 0.8424353003501892, + "learning_rate": 6.393135567480005e-05, + "loss": 1.0378, + "step": 7135 + }, + { + "epoch": 2.166034299590226, + "grad_norm": 1.0322976112365723, + "learning_rate": 6.392629340892984e-05, + "loss": 1.4644, + "step": 7136 + }, + { + "epoch": 2.1663378357869174, + "grad_norm": 0.6831735968589783, + "learning_rate": 6.392123114305964e-05, + "loss": 1.3927, + "step": 7137 + }, + { + "epoch": 2.166641371983609, + "grad_norm": 0.7594115138053894, + "learning_rate": 6.391616887718943e-05, + "loss": 1.3165, + "step": 7138 + }, + { + "epoch": 2.1669449081803007, + "grad_norm": 0.6064585447311401, + "learning_rate": 6.391110661131922e-05, + "loss": 0.8763, + "step": 7139 + }, + { + "epoch": 2.167248444376992, + "grad_norm": 0.840460479259491, + "learning_rate": 6.390604434544902e-05, + "loss": 1.5395, + "step": 7140 + }, + { + "epoch": 2.1675519805736836, + "grad_norm": 0.9302356839179993, + "learning_rate": 6.390098207957881e-05, + "loss": 1.2421, + "step": 7141 + }, + { + "epoch": 2.167855516770375, + "grad_norm": 0.7110074162483215, + "learning_rate": 6.389591981370861e-05, + "loss": 0.7126, + "step": 7142 + }, + { + "epoch": 2.1681590529670665, + "grad_norm": 0.9885009527206421, + "learning_rate": 6.389085754783842e-05, + "loss": 0.9977, + "step": 7143 + }, + { + "epoch": 2.1684625891637577, + "grad_norm": 0.8145515322685242, + "learning_rate": 6.388579528196821e-05, + "loss": 1.1864, + "step": 7144 + }, + { + "epoch": 2.1687661253604493, + "grad_norm": 0.6856915354728699, + "learning_rate": 6.388073301609801e-05, + "loss": 1.3053, + "step": 7145 + }, + { + "epoch": 2.1690696615571405, + "grad_norm": 0.6998456120491028, + "learning_rate": 6.38756707502278e-05, + "loss": 1.798, + "step": 7146 + }, + { + "epoch": 2.169373197753832, + "grad_norm": 0.7821376919746399, + "learning_rate": 6.38706084843576e-05, + "loss": 1.4081, + "step": 7147 + }, + { + "epoch": 2.1696767339505234, + "grad_norm": 0.7204583883285522, + "learning_rate": 6.386554621848739e-05, + "loss": 1.3859, + "step": 7148 + }, + { + "epoch": 2.169980270147215, + "grad_norm": 0.8906077146530151, + "learning_rate": 6.386048395261719e-05, + "loss": 0.8652, + "step": 7149 + }, + { + "epoch": 2.1702838063439067, + "grad_norm": 0.7288920879364014, + "learning_rate": 6.385542168674698e-05, + "loss": 1.4968, + "step": 7150 + }, + { + "epoch": 2.170587342540598, + "grad_norm": 0.6715928912162781, + "learning_rate": 6.385035942087679e-05, + "loss": 1.6004, + "step": 7151 + }, + { + "epoch": 2.1708908787372896, + "grad_norm": 0.7054345607757568, + "learning_rate": 6.384529715500658e-05, + "loss": 1.3363, + "step": 7152 + }, + { + "epoch": 2.171194414933981, + "grad_norm": 0.8027999401092529, + "learning_rate": 6.384023488913638e-05, + "loss": 1.0505, + "step": 7153 + }, + { + "epoch": 2.1714979511306725, + "grad_norm": 0.649910569190979, + "learning_rate": 6.383517262326619e-05, + "loss": 0.8694, + "step": 7154 + }, + { + "epoch": 2.1718014873273637, + "grad_norm": 0.636173665523529, + "learning_rate": 6.383011035739598e-05, + "loss": 1.3469, + "step": 7155 + }, + { + "epoch": 2.1721050235240553, + "grad_norm": 0.8084169030189514, + "learning_rate": 6.382504809152578e-05, + "loss": 1.3881, + "step": 7156 + }, + { + "epoch": 2.1724085597207465, + "grad_norm": 0.6515277028083801, + "learning_rate": 6.381998582565557e-05, + "loss": 1.602, + "step": 7157 + }, + { + "epoch": 2.172712095917438, + "grad_norm": 0.7483789324760437, + "learning_rate": 6.381492355978537e-05, + "loss": 1.5937, + "step": 7158 + }, + { + "epoch": 2.1730156321141294, + "grad_norm": 0.8204941749572754, + "learning_rate": 6.380986129391516e-05, + "loss": 1.3963, + "step": 7159 + }, + { + "epoch": 2.173319168310821, + "grad_norm": 0.7946287393569946, + "learning_rate": 6.380479902804496e-05, + "loss": 0.7602, + "step": 7160 + }, + { + "epoch": 2.1736227045075127, + "grad_norm": 0.6441590189933777, + "learning_rate": 6.379973676217475e-05, + "loss": 1.4581, + "step": 7161 + }, + { + "epoch": 2.173926240704204, + "grad_norm": 0.7281964421272278, + "learning_rate": 6.379467449630455e-05, + "loss": 1.3487, + "step": 7162 + }, + { + "epoch": 2.1742297769008956, + "grad_norm": 0.6523686051368713, + "learning_rate": 6.378961223043435e-05, + "loss": 1.8093, + "step": 7163 + }, + { + "epoch": 2.174533313097587, + "grad_norm": 0.8183250427246094, + "learning_rate": 6.378454996456415e-05, + "loss": 1.395, + "step": 7164 + }, + { + "epoch": 2.1748368492942785, + "grad_norm": 0.7952205538749695, + "learning_rate": 6.377948769869394e-05, + "loss": 1.3464, + "step": 7165 + }, + { + "epoch": 2.1751403854909697, + "grad_norm": 0.6068337559700012, + "learning_rate": 6.377442543282374e-05, + "loss": 1.1278, + "step": 7166 + }, + { + "epoch": 2.1754439216876613, + "grad_norm": 0.8475143909454346, + "learning_rate": 6.376936316695353e-05, + "loss": 1.3051, + "step": 7167 + }, + { + "epoch": 2.1757474578843525, + "grad_norm": 0.6326785683631897, + "learning_rate": 6.376430090108333e-05, + "loss": 1.51, + "step": 7168 + }, + { + "epoch": 2.176050994081044, + "grad_norm": 0.6794092655181885, + "learning_rate": 6.375923863521312e-05, + "loss": 1.4881, + "step": 7169 + }, + { + "epoch": 2.1763545302777354, + "grad_norm": 0.8222758173942566, + "learning_rate": 6.375417636934292e-05, + "loss": 1.4618, + "step": 7170 + }, + { + "epoch": 2.176658066474427, + "grad_norm": 0.7291663885116577, + "learning_rate": 6.374911410347271e-05, + "loss": 0.7921, + "step": 7171 + }, + { + "epoch": 2.1769616026711187, + "grad_norm": 0.6901921033859253, + "learning_rate": 6.374405183760252e-05, + "loss": 1.141, + "step": 7172 + }, + { + "epoch": 2.17726513886781, + "grad_norm": 0.6537145376205444, + "learning_rate": 6.373898957173232e-05, + "loss": 1.2864, + "step": 7173 + }, + { + "epoch": 2.1775686750645016, + "grad_norm": 0.6633203029632568, + "learning_rate": 6.373392730586211e-05, + "loss": 1.5173, + "step": 7174 + }, + { + "epoch": 2.177872211261193, + "grad_norm": 0.8034242987632751, + "learning_rate": 6.37288650399919e-05, + "loss": 0.8577, + "step": 7175 + }, + { + "epoch": 2.1781757474578844, + "grad_norm": 0.6730956435203552, + "learning_rate": 6.37238027741217e-05, + "loss": 1.5462, + "step": 7176 + }, + { + "epoch": 2.1784792836545757, + "grad_norm": 0.8179565668106079, + "learning_rate": 6.37187405082515e-05, + "loss": 0.7864, + "step": 7177 + }, + { + "epoch": 2.1787828198512673, + "grad_norm": 0.8936238884925842, + "learning_rate": 6.371367824238129e-05, + "loss": 1.4166, + "step": 7178 + }, + { + "epoch": 2.1790863560479585, + "grad_norm": 0.9726060032844543, + "learning_rate": 6.370861597651108e-05, + "loss": 0.9226, + "step": 7179 + }, + { + "epoch": 2.17938989224465, + "grad_norm": 0.786594569683075, + "learning_rate": 6.370355371064088e-05, + "loss": 1.3891, + "step": 7180 + }, + { + "epoch": 2.1796934284413414, + "grad_norm": 0.6364801526069641, + "learning_rate": 6.369849144477067e-05, + "loss": 0.9755, + "step": 7181 + }, + { + "epoch": 2.179996964638033, + "grad_norm": 1.0209544897079468, + "learning_rate": 6.369342917890048e-05, + "loss": 1.2559, + "step": 7182 + }, + { + "epoch": 2.1803005008347247, + "grad_norm": 0.5325672626495361, + "learning_rate": 6.368836691303028e-05, + "loss": 0.9392, + "step": 7183 + }, + { + "epoch": 2.180604037031416, + "grad_norm": 0.750744104385376, + "learning_rate": 6.368330464716007e-05, + "loss": 1.1019, + "step": 7184 + }, + { + "epoch": 2.1809075732281076, + "grad_norm": 0.6024854183197021, + "learning_rate": 6.367824238128987e-05, + "loss": 1.0678, + "step": 7185 + }, + { + "epoch": 2.181211109424799, + "grad_norm": 0.6822061538696289, + "learning_rate": 6.367318011541966e-05, + "loss": 1.1547, + "step": 7186 + }, + { + "epoch": 2.1815146456214904, + "grad_norm": 0.6472896933555603, + "learning_rate": 6.366811784954946e-05, + "loss": 1.356, + "step": 7187 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 0.6070579290390015, + "learning_rate": 6.366305558367925e-05, + "loss": 1.3724, + "step": 7188 + }, + { + "epoch": 2.1821217180148733, + "grad_norm": 0.7931656837463379, + "learning_rate": 6.365799331780905e-05, + "loss": 1.3362, + "step": 7189 + }, + { + "epoch": 2.1824252542115645, + "grad_norm": 0.6661885976791382, + "learning_rate": 6.365293105193884e-05, + "loss": 0.8277, + "step": 7190 + }, + { + "epoch": 2.182728790408256, + "grad_norm": 0.7171643376350403, + "learning_rate": 6.364786878606865e-05, + "loss": 1.148, + "step": 7191 + }, + { + "epoch": 2.183032326604948, + "grad_norm": 0.6635745763778687, + "learning_rate": 6.364280652019844e-05, + "loss": 1.257, + "step": 7192 + }, + { + "epoch": 2.183335862801639, + "grad_norm": 0.7740877270698547, + "learning_rate": 6.363774425432824e-05, + "loss": 1.229, + "step": 7193 + }, + { + "epoch": 2.1836393989983307, + "grad_norm": 0.8438175916671753, + "learning_rate": 6.363268198845803e-05, + "loss": 1.3026, + "step": 7194 + }, + { + "epoch": 2.183942935195022, + "grad_norm": 0.7249647378921509, + "learning_rate": 6.362761972258783e-05, + "loss": 1.4047, + "step": 7195 + }, + { + "epoch": 2.1842464713917136, + "grad_norm": 0.7198625206947327, + "learning_rate": 6.362255745671764e-05, + "loss": 1.6817, + "step": 7196 + }, + { + "epoch": 2.184550007588405, + "grad_norm": 0.7288183569908142, + "learning_rate": 6.361749519084743e-05, + "loss": 1.1093, + "step": 7197 + }, + { + "epoch": 2.1848535437850964, + "grad_norm": 1.0255497694015503, + "learning_rate": 6.361243292497723e-05, + "loss": 0.8903, + "step": 7198 + }, + { + "epoch": 2.1851570799817877, + "grad_norm": 0.6866830587387085, + "learning_rate": 6.360737065910702e-05, + "loss": 1.5924, + "step": 7199 + }, + { + "epoch": 2.1854606161784793, + "grad_norm": 0.7557784914970398, + "learning_rate": 6.360230839323682e-05, + "loss": 1.1523, + "step": 7200 + }, + { + "epoch": 2.185764152375171, + "grad_norm": 0.6118229627609253, + "learning_rate": 6.359724612736661e-05, + "loss": 1.2865, + "step": 7201 + }, + { + "epoch": 2.186067688571862, + "grad_norm": 0.7725751399993896, + "learning_rate": 6.359218386149642e-05, + "loss": 0.4212, + "step": 7202 + }, + { + "epoch": 2.186371224768554, + "grad_norm": 1.019127368927002, + "learning_rate": 6.358712159562621e-05, + "loss": 1.3152, + "step": 7203 + }, + { + "epoch": 2.186674760965245, + "grad_norm": 0.9102368354797363, + "learning_rate": 6.358205932975601e-05, + "loss": 0.4845, + "step": 7204 + }, + { + "epoch": 2.1869782971619367, + "grad_norm": 0.7527662515640259, + "learning_rate": 6.35769970638858e-05, + "loss": 1.3959, + "step": 7205 + }, + { + "epoch": 2.187281833358628, + "grad_norm": 0.6649389266967773, + "learning_rate": 6.35719347980156e-05, + "loss": 1.712, + "step": 7206 + }, + { + "epoch": 2.1875853695553196, + "grad_norm": 0.8035556077957153, + "learning_rate": 6.356687253214539e-05, + "loss": 1.4315, + "step": 7207 + }, + { + "epoch": 2.187888905752011, + "grad_norm": 0.7087796926498413, + "learning_rate": 6.356181026627519e-05, + "loss": 1.526, + "step": 7208 + }, + { + "epoch": 2.1881924419487024, + "grad_norm": 0.7753124833106995, + "learning_rate": 6.355674800040498e-05, + "loss": 1.2894, + "step": 7209 + }, + { + "epoch": 2.1884959781453937, + "grad_norm": 0.7068638205528259, + "learning_rate": 6.355168573453478e-05, + "loss": 1.2161, + "step": 7210 + }, + { + "epoch": 2.1887995143420853, + "grad_norm": 0.7776404619216919, + "learning_rate": 6.354662346866459e-05, + "loss": 1.6053, + "step": 7211 + }, + { + "epoch": 2.189103050538777, + "grad_norm": 0.8074328899383545, + "learning_rate": 6.354156120279438e-05, + "loss": 1.3399, + "step": 7212 + }, + { + "epoch": 2.189406586735468, + "grad_norm": 0.7460588216781616, + "learning_rate": 6.353649893692417e-05, + "loss": 1.4607, + "step": 7213 + }, + { + "epoch": 2.18971012293216, + "grad_norm": 0.6471102833747864, + "learning_rate": 6.353143667105397e-05, + "loss": 1.4977, + "step": 7214 + }, + { + "epoch": 2.190013659128851, + "grad_norm": 0.5782828330993652, + "learning_rate": 6.352637440518376e-05, + "loss": 1.319, + "step": 7215 + }, + { + "epoch": 2.1903171953255427, + "grad_norm": 0.701380729675293, + "learning_rate": 6.352131213931356e-05, + "loss": 1.4726, + "step": 7216 + }, + { + "epoch": 2.190620731522234, + "grad_norm": 0.7160446047782898, + "learning_rate": 6.351624987344335e-05, + "loss": 1.4939, + "step": 7217 + }, + { + "epoch": 2.1909242677189256, + "grad_norm": 0.6674351692199707, + "learning_rate": 6.351118760757315e-05, + "loss": 1.6086, + "step": 7218 + }, + { + "epoch": 2.191227803915617, + "grad_norm": 0.8916682600975037, + "learning_rate": 6.350612534170294e-05, + "loss": 1.362, + "step": 7219 + }, + { + "epoch": 2.1915313401123084, + "grad_norm": 0.683310329914093, + "learning_rate": 6.350106307583274e-05, + "loss": 1.4333, + "step": 7220 + }, + { + "epoch": 2.1918348763089996, + "grad_norm": 0.9091866612434387, + "learning_rate": 6.349600080996255e-05, + "loss": 1.4197, + "step": 7221 + }, + { + "epoch": 2.1921384125056913, + "grad_norm": 0.892817497253418, + "learning_rate": 6.349093854409234e-05, + "loss": 1.1823, + "step": 7222 + }, + { + "epoch": 2.192441948702383, + "grad_norm": 0.7482536435127258, + "learning_rate": 6.348587627822214e-05, + "loss": 1.1527, + "step": 7223 + }, + { + "epoch": 2.192745484899074, + "grad_norm": 0.6525892615318298, + "learning_rate": 6.348081401235193e-05, + "loss": 1.4811, + "step": 7224 + }, + { + "epoch": 2.193049021095766, + "grad_norm": 0.6662126183509827, + "learning_rate": 6.347575174648173e-05, + "loss": 1.5839, + "step": 7225 + }, + { + "epoch": 2.193352557292457, + "grad_norm": 0.7835372090339661, + "learning_rate": 6.347068948061152e-05, + "loss": 0.8512, + "step": 7226 + }, + { + "epoch": 2.1936560934891487, + "grad_norm": 0.6281005144119263, + "learning_rate": 6.346562721474132e-05, + "loss": 1.5767, + "step": 7227 + }, + { + "epoch": 2.19395962968584, + "grad_norm": 0.8295223116874695, + "learning_rate": 6.346056494887111e-05, + "loss": 1.3427, + "step": 7228 + }, + { + "epoch": 2.1942631658825316, + "grad_norm": 1.0514887571334839, + "learning_rate": 6.34555026830009e-05, + "loss": 1.2373, + "step": 7229 + }, + { + "epoch": 2.1945667020792228, + "grad_norm": 0.7697010040283203, + "learning_rate": 6.345044041713071e-05, + "loss": 1.1417, + "step": 7230 + }, + { + "epoch": 2.1948702382759144, + "grad_norm": 0.6578774452209473, + "learning_rate": 6.344537815126051e-05, + "loss": 0.9744, + "step": 7231 + }, + { + "epoch": 2.1951737744726056, + "grad_norm": 0.904004693031311, + "learning_rate": 6.34403158853903e-05, + "loss": 0.9456, + "step": 7232 + }, + { + "epoch": 2.1954773106692973, + "grad_norm": 0.7968248724937439, + "learning_rate": 6.34352536195201e-05, + "loss": 1.3675, + "step": 7233 + }, + { + "epoch": 2.195780846865989, + "grad_norm": 0.8919456005096436, + "learning_rate": 6.343019135364989e-05, + "loss": 1.4872, + "step": 7234 + }, + { + "epoch": 2.19608438306268, + "grad_norm": 0.7745645046234131, + "learning_rate": 6.342512908777969e-05, + "loss": 1.2568, + "step": 7235 + }, + { + "epoch": 2.196387919259372, + "grad_norm": 0.7691221237182617, + "learning_rate": 6.342006682190948e-05, + "loss": 1.3545, + "step": 7236 + }, + { + "epoch": 2.196691455456063, + "grad_norm": 0.8532097935676575, + "learning_rate": 6.341500455603928e-05, + "loss": 1.1938, + "step": 7237 + }, + { + "epoch": 2.1969949916527547, + "grad_norm": 0.7357925176620483, + "learning_rate": 6.340994229016907e-05, + "loss": 1.7771, + "step": 7238 + }, + { + "epoch": 2.197298527849446, + "grad_norm": 0.718250036239624, + "learning_rate": 6.340488002429888e-05, + "loss": 0.7017, + "step": 7239 + }, + { + "epoch": 2.1976020640461376, + "grad_norm": 0.752316415309906, + "learning_rate": 6.339981775842868e-05, + "loss": 0.9411, + "step": 7240 + }, + { + "epoch": 2.1979056002428288, + "grad_norm": 0.710756242275238, + "learning_rate": 6.339475549255848e-05, + "loss": 1.1259, + "step": 7241 + }, + { + "epoch": 2.1982091364395204, + "grad_norm": 0.728549063205719, + "learning_rate": 6.338969322668828e-05, + "loss": 0.9745, + "step": 7242 + }, + { + "epoch": 2.1985126726362116, + "grad_norm": 0.8064560294151306, + "learning_rate": 6.338463096081807e-05, + "loss": 1.3108, + "step": 7243 + }, + { + "epoch": 2.1988162088329033, + "grad_norm": 0.7704907655715942, + "learning_rate": 6.337956869494787e-05, + "loss": 1.483, + "step": 7244 + }, + { + "epoch": 2.199119745029595, + "grad_norm": 0.8112806677818298, + "learning_rate": 6.337450642907766e-05, + "loss": 1.2775, + "step": 7245 + }, + { + "epoch": 2.199423281226286, + "grad_norm": 0.687940239906311, + "learning_rate": 6.336944416320746e-05, + "loss": 1.5365, + "step": 7246 + }, + { + "epoch": 2.199726817422978, + "grad_norm": 0.8638631701469421, + "learning_rate": 6.336438189733725e-05, + "loss": 0.903, + "step": 7247 + }, + { + "epoch": 2.200030353619669, + "grad_norm": 0.8518549203872681, + "learning_rate": 6.335931963146705e-05, + "loss": 1.2908, + "step": 7248 + }, + { + "epoch": 2.2003338898163607, + "grad_norm": 0.5747170448303223, + "learning_rate": 6.335425736559684e-05, + "loss": 1.6174, + "step": 7249 + }, + { + "epoch": 2.200637426013052, + "grad_norm": 0.7496510148048401, + "learning_rate": 6.334919509972665e-05, + "loss": 1.5804, + "step": 7250 + }, + { + "epoch": 2.2009409622097436, + "grad_norm": 0.7555733919143677, + "learning_rate": 6.334413283385644e-05, + "loss": 1.261, + "step": 7251 + }, + { + "epoch": 2.2012444984064348, + "grad_norm": 0.7829087972640991, + "learning_rate": 6.333907056798624e-05, + "loss": 0.8846, + "step": 7252 + }, + { + "epoch": 2.2015480346031264, + "grad_norm": 0.7930013537406921, + "learning_rate": 6.333400830211603e-05, + "loss": 1.4279, + "step": 7253 + }, + { + "epoch": 2.201851570799818, + "grad_norm": 0.8092796802520752, + "learning_rate": 6.332894603624583e-05, + "loss": 0.9596, + "step": 7254 + }, + { + "epoch": 2.2021551069965093, + "grad_norm": 0.7831472158432007, + "learning_rate": 6.332388377037562e-05, + "loss": 1.5345, + "step": 7255 + }, + { + "epoch": 2.202458643193201, + "grad_norm": 0.6508380174636841, + "learning_rate": 6.331882150450542e-05, + "loss": 1.7227, + "step": 7256 + }, + { + "epoch": 2.202762179389892, + "grad_norm": 0.9539698362350464, + "learning_rate": 6.331375923863521e-05, + "loss": 1.133, + "step": 7257 + }, + { + "epoch": 2.203065715586584, + "grad_norm": 0.8370058536529541, + "learning_rate": 6.330869697276501e-05, + "loss": 1.3995, + "step": 7258 + }, + { + "epoch": 2.203369251783275, + "grad_norm": 0.7775351405143738, + "learning_rate": 6.33036347068948e-05, + "loss": 1.4441, + "step": 7259 + }, + { + "epoch": 2.2036727879799667, + "grad_norm": 0.5709270238876343, + "learning_rate": 6.329857244102461e-05, + "loss": 1.2477, + "step": 7260 + }, + { + "epoch": 2.203976324176658, + "grad_norm": 0.8071433901786804, + "learning_rate": 6.32935101751544e-05, + "loss": 0.9049, + "step": 7261 + }, + { + "epoch": 2.2042798603733496, + "grad_norm": 0.831382155418396, + "learning_rate": 6.32884479092842e-05, + "loss": 1.0145, + "step": 7262 + }, + { + "epoch": 2.204583396570041, + "grad_norm": 2.287936210632324, + "learning_rate": 6.3283385643414e-05, + "loss": 1.6595, + "step": 7263 + }, + { + "epoch": 2.2048869327667324, + "grad_norm": 0.7208482623100281, + "learning_rate": 6.327832337754379e-05, + "loss": 0.7489, + "step": 7264 + }, + { + "epoch": 2.205190468963424, + "grad_norm": 1.0533088445663452, + "learning_rate": 6.327326111167359e-05, + "loss": 1.4194, + "step": 7265 + }, + { + "epoch": 2.2054940051601153, + "grad_norm": 0.5085039734840393, + "learning_rate": 6.326819884580338e-05, + "loss": 1.5545, + "step": 7266 + }, + { + "epoch": 2.205797541356807, + "grad_norm": 0.792374849319458, + "learning_rate": 6.326313657993318e-05, + "loss": 1.2997, + "step": 7267 + }, + { + "epoch": 2.206101077553498, + "grad_norm": 0.6318290829658508, + "learning_rate": 6.325807431406297e-05, + "loss": 1.0966, + "step": 7268 + }, + { + "epoch": 2.20640461375019, + "grad_norm": 0.7878068089485168, + "learning_rate": 6.325301204819278e-05, + "loss": 0.932, + "step": 7269 + }, + { + "epoch": 2.206708149946881, + "grad_norm": 0.6690864562988281, + "learning_rate": 6.324794978232257e-05, + "loss": 1.2388, + "step": 7270 + }, + { + "epoch": 2.2070116861435727, + "grad_norm": 0.6518906354904175, + "learning_rate": 6.324288751645237e-05, + "loss": 1.3831, + "step": 7271 + }, + { + "epoch": 2.207315222340264, + "grad_norm": 0.7100988030433655, + "learning_rate": 6.323782525058216e-05, + "loss": 1.2587, + "step": 7272 + }, + { + "epoch": 2.2076187585369555, + "grad_norm": 1.214737892150879, + "learning_rate": 6.323276298471196e-05, + "loss": 0.9016, + "step": 7273 + }, + { + "epoch": 2.207922294733647, + "grad_norm": 0.8014380931854248, + "learning_rate": 6.322770071884175e-05, + "loss": 1.2304, + "step": 7274 + }, + { + "epoch": 2.2082258309303384, + "grad_norm": 0.7462916374206543, + "learning_rate": 6.322263845297155e-05, + "loss": 0.8393, + "step": 7275 + }, + { + "epoch": 2.20852936712703, + "grad_norm": 0.8555428385734558, + "learning_rate": 6.321757618710134e-05, + "loss": 1.3557, + "step": 7276 + }, + { + "epoch": 2.2088329033237213, + "grad_norm": 0.6086410284042358, + "learning_rate": 6.321251392123114e-05, + "loss": 1.2881, + "step": 7277 + }, + { + "epoch": 2.209136439520413, + "grad_norm": 0.6963145732879639, + "learning_rate": 6.320745165536094e-05, + "loss": 1.1412, + "step": 7278 + }, + { + "epoch": 2.209439975717104, + "grad_norm": 0.7891445159912109, + "learning_rate": 6.320238938949074e-05, + "loss": 1.2458, + "step": 7279 + }, + { + "epoch": 2.209743511913796, + "grad_norm": 0.7201794385910034, + "learning_rate": 6.319732712362053e-05, + "loss": 1.4347, + "step": 7280 + }, + { + "epoch": 2.210047048110487, + "grad_norm": 0.7422575950622559, + "learning_rate": 6.319226485775033e-05, + "loss": 1.6253, + "step": 7281 + }, + { + "epoch": 2.2103505843071787, + "grad_norm": 0.6944773197174072, + "learning_rate": 6.318720259188012e-05, + "loss": 1.2173, + "step": 7282 + }, + { + "epoch": 2.21065412050387, + "grad_norm": 0.7755913734436035, + "learning_rate": 6.318214032600992e-05, + "loss": 1.2927, + "step": 7283 + }, + { + "epoch": 2.2109576567005615, + "grad_norm": 0.7552181482315063, + "learning_rate": 6.317707806013971e-05, + "loss": 1.1499, + "step": 7284 + }, + { + "epoch": 2.211261192897253, + "grad_norm": 1.007400631904602, + "learning_rate": 6.317201579426952e-05, + "loss": 1.6549, + "step": 7285 + }, + { + "epoch": 2.2115647290939444, + "grad_norm": 0.8183742165565491, + "learning_rate": 6.316695352839932e-05, + "loss": 1.3761, + "step": 7286 + }, + { + "epoch": 2.211868265290636, + "grad_norm": 0.7423942685127258, + "learning_rate": 6.316189126252911e-05, + "loss": 1.062, + "step": 7287 + }, + { + "epoch": 2.2121718014873273, + "grad_norm": 0.8269255757331848, + "learning_rate": 6.31568289966589e-05, + "loss": 1.1571, + "step": 7288 + }, + { + "epoch": 2.212475337684019, + "grad_norm": 0.8049871921539307, + "learning_rate": 6.315176673078871e-05, + "loss": 1.1371, + "step": 7289 + }, + { + "epoch": 2.21277887388071, + "grad_norm": 0.6453428268432617, + "learning_rate": 6.314670446491851e-05, + "loss": 1.4766, + "step": 7290 + }, + { + "epoch": 2.213082410077402, + "grad_norm": 0.8065813183784485, + "learning_rate": 6.31416421990483e-05, + "loss": 1.5128, + "step": 7291 + }, + { + "epoch": 2.213385946274093, + "grad_norm": 0.749000608921051, + "learning_rate": 6.31365799331781e-05, + "loss": 1.2478, + "step": 7292 + }, + { + "epoch": 2.2136894824707847, + "grad_norm": 0.7806560397148132, + "learning_rate": 6.31315176673079e-05, + "loss": 1.6537, + "step": 7293 + }, + { + "epoch": 2.213993018667476, + "grad_norm": 0.8572752475738525, + "learning_rate": 6.312645540143769e-05, + "loss": 1.2017, + "step": 7294 + }, + { + "epoch": 2.2142965548641675, + "grad_norm": 0.652068018913269, + "learning_rate": 6.312139313556748e-05, + "loss": 1.6904, + "step": 7295 + }, + { + "epoch": 2.214600091060859, + "grad_norm": 0.6921754479408264, + "learning_rate": 6.311633086969728e-05, + "loss": 1.1166, + "step": 7296 + }, + { + "epoch": 2.2149036272575504, + "grad_norm": 0.7487016320228577, + "learning_rate": 6.311126860382707e-05, + "loss": 1.5376, + "step": 7297 + }, + { + "epoch": 2.215207163454242, + "grad_norm": 0.8527117967605591, + "learning_rate": 6.310620633795687e-05, + "loss": 1.2639, + "step": 7298 + }, + { + "epoch": 2.2155106996509333, + "grad_norm": 0.8065406084060669, + "learning_rate": 6.310114407208668e-05, + "loss": 1.2687, + "step": 7299 + }, + { + "epoch": 2.215814235847625, + "grad_norm": 0.8539122343063354, + "learning_rate": 6.309608180621647e-05, + "loss": 1.4641, + "step": 7300 + }, + { + "epoch": 2.216117772044316, + "grad_norm": 0.8697571158409119, + "learning_rate": 6.309101954034627e-05, + "loss": 0.8682, + "step": 7301 + }, + { + "epoch": 2.216421308241008, + "grad_norm": 0.6010316610336304, + "learning_rate": 6.308595727447606e-05, + "loss": 1.2321, + "step": 7302 + }, + { + "epoch": 2.216724844437699, + "grad_norm": 0.5963598489761353, + "learning_rate": 6.308089500860586e-05, + "loss": 1.2184, + "step": 7303 + }, + { + "epoch": 2.2170283806343907, + "grad_norm": 0.6307588815689087, + "learning_rate": 6.307583274273565e-05, + "loss": 1.2, + "step": 7304 + }, + { + "epoch": 2.217331916831082, + "grad_norm": 0.7107384204864502, + "learning_rate": 6.307077047686545e-05, + "loss": 1.3101, + "step": 7305 + }, + { + "epoch": 2.2176354530277735, + "grad_norm": 0.5359188318252563, + "learning_rate": 6.306570821099524e-05, + "loss": 1.652, + "step": 7306 + }, + { + "epoch": 2.217938989224465, + "grad_norm": 0.617853045463562, + "learning_rate": 6.306064594512503e-05, + "loss": 1.5773, + "step": 7307 + }, + { + "epoch": 2.2182425254211564, + "grad_norm": 0.7007902264595032, + "learning_rate": 6.305558367925484e-05, + "loss": 1.3057, + "step": 7308 + }, + { + "epoch": 2.218546061617848, + "grad_norm": 0.7740719318389893, + "learning_rate": 6.305052141338464e-05, + "loss": 1.4049, + "step": 7309 + }, + { + "epoch": 2.2188495978145393, + "grad_norm": 1.0561450719833374, + "learning_rate": 6.304545914751443e-05, + "loss": 1.0139, + "step": 7310 + }, + { + "epoch": 2.219153134011231, + "grad_norm": 0.5591197609901428, + "learning_rate": 6.304039688164423e-05, + "loss": 1.0521, + "step": 7311 + }, + { + "epoch": 2.219456670207922, + "grad_norm": 0.7620384097099304, + "learning_rate": 6.303533461577402e-05, + "loss": 1.5936, + "step": 7312 + }, + { + "epoch": 2.219760206404614, + "grad_norm": 0.4855552911758423, + "learning_rate": 6.303027234990382e-05, + "loss": 1.0651, + "step": 7313 + }, + { + "epoch": 2.220063742601305, + "grad_norm": 0.7248707413673401, + "learning_rate": 6.302521008403361e-05, + "loss": 1.1932, + "step": 7314 + }, + { + "epoch": 2.2203672787979967, + "grad_norm": 0.9377697110176086, + "learning_rate": 6.30201478181634e-05, + "loss": 1.0397, + "step": 7315 + }, + { + "epoch": 2.2206708149946883, + "grad_norm": 0.6386867761611938, + "learning_rate": 6.30150855522932e-05, + "loss": 1.5274, + "step": 7316 + }, + { + "epoch": 2.2209743511913795, + "grad_norm": 0.7806806564331055, + "learning_rate": 6.301002328642301e-05, + "loss": 1.4235, + "step": 7317 + }, + { + "epoch": 2.221277887388071, + "grad_norm": 0.7374126315116882, + "learning_rate": 6.30049610205528e-05, + "loss": 1.7768, + "step": 7318 + }, + { + "epoch": 2.2215814235847624, + "grad_norm": 0.7384335994720459, + "learning_rate": 6.29998987546826e-05, + "loss": 1.617, + "step": 7319 + }, + { + "epoch": 2.221884959781454, + "grad_norm": 0.6326069831848145, + "learning_rate": 6.29948364888124e-05, + "loss": 1.4716, + "step": 7320 + }, + { + "epoch": 2.2221884959781453, + "grad_norm": 0.8205044269561768, + "learning_rate": 6.298977422294219e-05, + "loss": 1.1083, + "step": 7321 + }, + { + "epoch": 2.222492032174837, + "grad_norm": 0.6621566414833069, + "learning_rate": 6.298471195707198e-05, + "loss": 1.1608, + "step": 7322 + }, + { + "epoch": 2.222795568371528, + "grad_norm": 0.6662036776542664, + "learning_rate": 6.297964969120178e-05, + "loss": 1.6083, + "step": 7323 + }, + { + "epoch": 2.22309910456822, + "grad_norm": 0.9175953269004822, + "learning_rate": 6.297458742533157e-05, + "loss": 1.2642, + "step": 7324 + }, + { + "epoch": 2.223402640764911, + "grad_norm": 0.6709551811218262, + "learning_rate": 6.296952515946137e-05, + "loss": 0.9856, + "step": 7325 + }, + { + "epoch": 2.2237061769616027, + "grad_norm": 0.7839862704277039, + "learning_rate": 6.296446289359116e-05, + "loss": 1.5064, + "step": 7326 + }, + { + "epoch": 2.2240097131582943, + "grad_norm": 0.7650319933891296, + "learning_rate": 6.295940062772097e-05, + "loss": 1.2924, + "step": 7327 + }, + { + "epoch": 2.2243132493549855, + "grad_norm": 0.5967111587524414, + "learning_rate": 6.295433836185077e-05, + "loss": 1.3916, + "step": 7328 + }, + { + "epoch": 2.224616785551677, + "grad_norm": 0.9305431842803955, + "learning_rate": 6.294927609598057e-05, + "loss": 1.4844, + "step": 7329 + }, + { + "epoch": 2.2249203217483684, + "grad_norm": 0.8193114399909973, + "learning_rate": 6.294421383011037e-05, + "loss": 1.3932, + "step": 7330 + }, + { + "epoch": 2.22522385794506, + "grad_norm": 0.6530084013938904, + "learning_rate": 6.293915156424016e-05, + "loss": 1.4033, + "step": 7331 + }, + { + "epoch": 2.2255273941417513, + "grad_norm": 0.7277489304542542, + "learning_rate": 6.293408929836996e-05, + "loss": 0.9039, + "step": 7332 + }, + { + "epoch": 2.225830930338443, + "grad_norm": 0.7713912129402161, + "learning_rate": 6.292902703249975e-05, + "loss": 1.4569, + "step": 7333 + }, + { + "epoch": 2.226134466535134, + "grad_norm": 0.5298659205436707, + "learning_rate": 6.292396476662955e-05, + "loss": 1.2005, + "step": 7334 + }, + { + "epoch": 2.226438002731826, + "grad_norm": 0.6997772455215454, + "learning_rate": 6.291890250075934e-05, + "loss": 1.4526, + "step": 7335 + }, + { + "epoch": 2.2267415389285174, + "grad_norm": 0.724656879901886, + "learning_rate": 6.291384023488914e-05, + "loss": 1.4197, + "step": 7336 + }, + { + "epoch": 2.2270450751252087, + "grad_norm": 0.6488526463508606, + "learning_rate": 6.290877796901893e-05, + "loss": 1.8035, + "step": 7337 + }, + { + "epoch": 2.2273486113219003, + "grad_norm": 0.6727263331413269, + "learning_rate": 6.290371570314874e-05, + "loss": 1.0752, + "step": 7338 + }, + { + "epoch": 2.2276521475185915, + "grad_norm": 0.8429280519485474, + "learning_rate": 6.289865343727854e-05, + "loss": 1.4221, + "step": 7339 + }, + { + "epoch": 2.227955683715283, + "grad_norm": 0.8624864816665649, + "learning_rate": 6.289359117140833e-05, + "loss": 1.325, + "step": 7340 + }, + { + "epoch": 2.2282592199119744, + "grad_norm": 0.7922910451889038, + "learning_rate": 6.288852890553813e-05, + "loss": 1.347, + "step": 7341 + }, + { + "epoch": 2.228562756108666, + "grad_norm": 0.7603941559791565, + "learning_rate": 6.288346663966792e-05, + "loss": 1.3607, + "step": 7342 + }, + { + "epoch": 2.2288662923053573, + "grad_norm": 0.869234025478363, + "learning_rate": 6.287840437379772e-05, + "loss": 1.2962, + "step": 7343 + }, + { + "epoch": 2.229169828502049, + "grad_norm": 0.7643560767173767, + "learning_rate": 6.287334210792751e-05, + "loss": 1.3782, + "step": 7344 + }, + { + "epoch": 2.22947336469874, + "grad_norm": 0.6984087824821472, + "learning_rate": 6.28682798420573e-05, + "loss": 1.1741, + "step": 7345 + }, + { + "epoch": 2.229776900895432, + "grad_norm": 0.8322027921676636, + "learning_rate": 6.28632175761871e-05, + "loss": 1.5343, + "step": 7346 + }, + { + "epoch": 2.2300804370921234, + "grad_norm": 0.5010148882865906, + "learning_rate": 6.285815531031691e-05, + "loss": 1.5332, + "step": 7347 + }, + { + "epoch": 2.2303839732888147, + "grad_norm": 0.8717657923698425, + "learning_rate": 6.28530930444467e-05, + "loss": 1.3991, + "step": 7348 + }, + { + "epoch": 2.2306875094855063, + "grad_norm": 0.6880953311920166, + "learning_rate": 6.28480307785765e-05, + "loss": 1.2642, + "step": 7349 + }, + { + "epoch": 2.2309910456821975, + "grad_norm": 0.6598102450370789, + "learning_rate": 6.284296851270629e-05, + "loss": 1.7148, + "step": 7350 + }, + { + "epoch": 2.231294581878889, + "grad_norm": 0.6542856693267822, + "learning_rate": 6.283790624683609e-05, + "loss": 1.4979, + "step": 7351 + }, + { + "epoch": 2.2315981180755804, + "grad_norm": 1.0812065601348877, + "learning_rate": 6.283284398096588e-05, + "loss": 1.6166, + "step": 7352 + }, + { + "epoch": 2.231901654272272, + "grad_norm": 0.9855359792709351, + "learning_rate": 6.282778171509568e-05, + "loss": 0.8146, + "step": 7353 + }, + { + "epoch": 2.2322051904689633, + "grad_norm": 0.818660318851471, + "learning_rate": 6.282271944922547e-05, + "loss": 1.3215, + "step": 7354 + }, + { + "epoch": 2.232508726665655, + "grad_norm": 0.5664464235305786, + "learning_rate": 6.281765718335527e-05, + "loss": 1.448, + "step": 7355 + }, + { + "epoch": 2.232812262862346, + "grad_norm": 0.786517858505249, + "learning_rate": 6.281259491748507e-05, + "loss": 1.4184, + "step": 7356 + }, + { + "epoch": 2.233115799059038, + "grad_norm": 0.7664825320243835, + "learning_rate": 6.280753265161487e-05, + "loss": 1.5741, + "step": 7357 + }, + { + "epoch": 2.2334193352557294, + "grad_norm": 0.7034837603569031, + "learning_rate": 6.280247038574466e-05, + "loss": 1.357, + "step": 7358 + }, + { + "epoch": 2.2337228714524207, + "grad_norm": 0.6958170533180237, + "learning_rate": 6.279740811987446e-05, + "loss": 1.4679, + "step": 7359 + }, + { + "epoch": 2.2340264076491123, + "grad_norm": 0.5791650414466858, + "learning_rate": 6.279234585400425e-05, + "loss": 1.6852, + "step": 7360 + }, + { + "epoch": 2.2343299438458035, + "grad_norm": 0.6118115782737732, + "learning_rate": 6.278728358813405e-05, + "loss": 1.2855, + "step": 7361 + }, + { + "epoch": 2.234633480042495, + "grad_norm": 0.6189674735069275, + "learning_rate": 6.278222132226384e-05, + "loss": 1.1345, + "step": 7362 + }, + { + "epoch": 2.2349370162391864, + "grad_norm": 0.803203821182251, + "learning_rate": 6.277715905639364e-05, + "loss": 1.1459, + "step": 7363 + }, + { + "epoch": 2.235240552435878, + "grad_norm": 0.7166264057159424, + "learning_rate": 6.277209679052343e-05, + "loss": 1.7317, + "step": 7364 + }, + { + "epoch": 2.2355440886325693, + "grad_norm": 0.7164594531059265, + "learning_rate": 6.276703452465323e-05, + "loss": 0.9119, + "step": 7365 + }, + { + "epoch": 2.235847624829261, + "grad_norm": 0.7110953330993652, + "learning_rate": 6.276197225878304e-05, + "loss": 1.5785, + "step": 7366 + }, + { + "epoch": 2.236151161025952, + "grad_norm": 0.8318694233894348, + "learning_rate": 6.275690999291283e-05, + "loss": 1.2255, + "step": 7367 + }, + { + "epoch": 2.236454697222644, + "grad_norm": 0.6412427425384521, + "learning_rate": 6.275184772704263e-05, + "loss": 1.0238, + "step": 7368 + }, + { + "epoch": 2.2367582334193354, + "grad_norm": 0.8117619752883911, + "learning_rate": 6.274678546117242e-05, + "loss": 1.3353, + "step": 7369 + }, + { + "epoch": 2.2370617696160267, + "grad_norm": 0.5850114822387695, + "learning_rate": 6.274172319530222e-05, + "loss": 1.6376, + "step": 7370 + }, + { + "epoch": 2.2373653058127183, + "grad_norm": 0.7341693639755249, + "learning_rate": 6.273666092943201e-05, + "loss": 1.2459, + "step": 7371 + }, + { + "epoch": 2.2376688420094095, + "grad_norm": 0.8353118896484375, + "learning_rate": 6.27315986635618e-05, + "loss": 1.1781, + "step": 7372 + }, + { + "epoch": 2.237972378206101, + "grad_norm": 0.7113072872161865, + "learning_rate": 6.27265363976916e-05, + "loss": 1.3433, + "step": 7373 + }, + { + "epoch": 2.2382759144027924, + "grad_norm": 0.681052565574646, + "learning_rate": 6.272147413182141e-05, + "loss": 1.4596, + "step": 7374 + }, + { + "epoch": 2.238579450599484, + "grad_norm": 0.8312979340553284, + "learning_rate": 6.27164118659512e-05, + "loss": 1.3036, + "step": 7375 + }, + { + "epoch": 2.2388829867961753, + "grad_norm": 1.068250298500061, + "learning_rate": 6.2711349600081e-05, + "loss": 1.1011, + "step": 7376 + }, + { + "epoch": 2.239186522992867, + "grad_norm": 0.831622838973999, + "learning_rate": 6.27062873342108e-05, + "loss": 1.1204, + "step": 7377 + }, + { + "epoch": 2.239490059189558, + "grad_norm": 0.6426302790641785, + "learning_rate": 6.27012250683406e-05, + "loss": 1.7318, + "step": 7378 + }, + { + "epoch": 2.23979359538625, + "grad_norm": 0.8913399577140808, + "learning_rate": 6.26961628024704e-05, + "loss": 1.414, + "step": 7379 + }, + { + "epoch": 2.2400971315829414, + "grad_norm": 0.827407717704773, + "learning_rate": 6.269110053660019e-05, + "loss": 1.3879, + "step": 7380 + }, + { + "epoch": 2.2404006677796326, + "grad_norm": 0.6446571350097656, + "learning_rate": 6.268603827072999e-05, + "loss": 1.6254, + "step": 7381 + }, + { + "epoch": 2.2407042039763243, + "grad_norm": 0.820245623588562, + "learning_rate": 6.268097600485978e-05, + "loss": 1.2645, + "step": 7382 + }, + { + "epoch": 2.2410077401730155, + "grad_norm": 0.9686972498893738, + "learning_rate": 6.267591373898957e-05, + "loss": 1.0926, + "step": 7383 + }, + { + "epoch": 2.241311276369707, + "grad_norm": 0.6719053387641907, + "learning_rate": 6.267085147311937e-05, + "loss": 1.2106, + "step": 7384 + }, + { + "epoch": 2.2416148125663984, + "grad_norm": 0.7990976572036743, + "learning_rate": 6.266578920724916e-05, + "loss": 1.3629, + "step": 7385 + }, + { + "epoch": 2.24191834876309, + "grad_norm": 0.8770137429237366, + "learning_rate": 6.266072694137897e-05, + "loss": 1.1797, + "step": 7386 + }, + { + "epoch": 2.2422218849597813, + "grad_norm": 0.8285596370697021, + "learning_rate": 6.265566467550877e-05, + "loss": 0.7679, + "step": 7387 + }, + { + "epoch": 2.242525421156473, + "grad_norm": 0.8647116422653198, + "learning_rate": 6.265060240963856e-05, + "loss": 0.9769, + "step": 7388 + }, + { + "epoch": 2.2428289573531646, + "grad_norm": 0.9097809791564941, + "learning_rate": 6.264554014376836e-05, + "loss": 0.9809, + "step": 7389 + }, + { + "epoch": 2.2431324935498558, + "grad_norm": 0.7164098620414734, + "learning_rate": 6.264047787789815e-05, + "loss": 1.7244, + "step": 7390 + }, + { + "epoch": 2.2434360297465474, + "grad_norm": 0.7220231890678406, + "learning_rate": 6.263541561202795e-05, + "loss": 1.3302, + "step": 7391 + }, + { + "epoch": 2.2437395659432386, + "grad_norm": 0.7721933722496033, + "learning_rate": 6.263035334615774e-05, + "loss": 0.8151, + "step": 7392 + }, + { + "epoch": 2.2440431021399303, + "grad_norm": 0.6171678304672241, + "learning_rate": 6.262529108028754e-05, + "loss": 1.5075, + "step": 7393 + }, + { + "epoch": 2.2443466383366215, + "grad_norm": 1.0465861558914185, + "learning_rate": 6.262022881441733e-05, + "loss": 1.2764, + "step": 7394 + }, + { + "epoch": 2.244650174533313, + "grad_norm": 0.7800273299217224, + "learning_rate": 6.261516654854714e-05, + "loss": 0.9609, + "step": 7395 + }, + { + "epoch": 2.2449537107300044, + "grad_norm": 0.7144536972045898, + "learning_rate": 6.261010428267693e-05, + "loss": 1.1818, + "step": 7396 + }, + { + "epoch": 2.245257246926696, + "grad_norm": 0.7565951943397522, + "learning_rate": 6.260504201680673e-05, + "loss": 1.3815, + "step": 7397 + }, + { + "epoch": 2.2455607831233877, + "grad_norm": 0.689526379108429, + "learning_rate": 6.259997975093652e-05, + "loss": 1.423, + "step": 7398 + }, + { + "epoch": 2.245864319320079, + "grad_norm": 0.6433810591697693, + "learning_rate": 6.259491748506632e-05, + "loss": 1.5474, + "step": 7399 + }, + { + "epoch": 2.2461678555167706, + "grad_norm": 0.5929258465766907, + "learning_rate": 6.258985521919611e-05, + "loss": 1.1298, + "step": 7400 + }, + { + "epoch": 2.2464713917134618, + "grad_norm": 0.7315686941146851, + "learning_rate": 6.258479295332591e-05, + "loss": 1.5642, + "step": 7401 + }, + { + "epoch": 2.2467749279101534, + "grad_norm": 0.7189039587974548, + "learning_rate": 6.25797306874557e-05, + "loss": 0.8376, + "step": 7402 + }, + { + "epoch": 2.2470784641068446, + "grad_norm": 0.7872570157051086, + "learning_rate": 6.25746684215855e-05, + "loss": 1.3381, + "step": 7403 + }, + { + "epoch": 2.2473820003035363, + "grad_norm": 0.5732450485229492, + "learning_rate": 6.256960615571529e-05, + "loss": 1.5919, + "step": 7404 + }, + { + "epoch": 2.2476855365002275, + "grad_norm": 0.7429735660552979, + "learning_rate": 6.25645438898451e-05, + "loss": 1.221, + "step": 7405 + }, + { + "epoch": 2.247989072696919, + "grad_norm": 0.8105106353759766, + "learning_rate": 6.25594816239749e-05, + "loss": 1.2708, + "step": 7406 + }, + { + "epoch": 2.2482926088936104, + "grad_norm": 0.779171884059906, + "learning_rate": 6.255441935810469e-05, + "loss": 1.3488, + "step": 7407 + }, + { + "epoch": 2.248596145090302, + "grad_norm": 0.5718879699707031, + "learning_rate": 6.254935709223449e-05, + "loss": 1.8277, + "step": 7408 + }, + { + "epoch": 2.2488996812869937, + "grad_norm": 0.6967871785163879, + "learning_rate": 6.254429482636428e-05, + "loss": 1.4693, + "step": 7409 + }, + { + "epoch": 2.249203217483685, + "grad_norm": 0.7380461692810059, + "learning_rate": 6.253923256049407e-05, + "loss": 1.2631, + "step": 7410 + }, + { + "epoch": 2.2495067536803766, + "grad_norm": 0.7310465574264526, + "learning_rate": 6.253417029462387e-05, + "loss": 1.2298, + "step": 7411 + }, + { + "epoch": 2.2498102898770678, + "grad_norm": 0.6302371621131897, + "learning_rate": 6.252910802875366e-05, + "loss": 1.616, + "step": 7412 + }, + { + "epoch": 2.2501138260737594, + "grad_norm": 0.8418757915496826, + "learning_rate": 6.252404576288346e-05, + "loss": 1.1698, + "step": 7413 + }, + { + "epoch": 2.2504173622704506, + "grad_norm": 0.823052167892456, + "learning_rate": 6.251898349701327e-05, + "loss": 1.3639, + "step": 7414 + }, + { + "epoch": 2.2507208984671423, + "grad_norm": 0.6069715619087219, + "learning_rate": 6.251392123114306e-05, + "loss": 1.3813, + "step": 7415 + }, + { + "epoch": 2.2510244346638335, + "grad_norm": 0.7191272377967834, + "learning_rate": 6.250885896527286e-05, + "loss": 1.3083, + "step": 7416 + }, + { + "epoch": 2.251327970860525, + "grad_norm": 0.6589117050170898, + "learning_rate": 6.250379669940265e-05, + "loss": 1.5802, + "step": 7417 + }, + { + "epoch": 2.2516315070572164, + "grad_norm": 0.8188351392745972, + "learning_rate": 6.249873443353246e-05, + "loss": 1.5078, + "step": 7418 + }, + { + "epoch": 2.251935043253908, + "grad_norm": 0.8425248861312866, + "learning_rate": 6.249367216766225e-05, + "loss": 1.4421, + "step": 7419 + }, + { + "epoch": 2.2522385794505997, + "grad_norm": 0.7744620442390442, + "learning_rate": 6.248860990179205e-05, + "loss": 1.3202, + "step": 7420 + }, + { + "epoch": 2.252542115647291, + "grad_norm": 0.8308604955673218, + "learning_rate": 6.248354763592184e-05, + "loss": 1.3473, + "step": 7421 + }, + { + "epoch": 2.2528456518439826, + "grad_norm": 0.9154324531555176, + "learning_rate": 6.247848537005164e-05, + "loss": 1.4875, + "step": 7422 + }, + { + "epoch": 2.2531491880406738, + "grad_norm": 0.8225959539413452, + "learning_rate": 6.247342310418143e-05, + "loss": 1.5585, + "step": 7423 + }, + { + "epoch": 2.2534527242373654, + "grad_norm": 0.787828803062439, + "learning_rate": 6.246836083831123e-05, + "loss": 0.9589, + "step": 7424 + }, + { + "epoch": 2.2537562604340566, + "grad_norm": 0.7419188618659973, + "learning_rate": 6.246329857244104e-05, + "loss": 1.3605, + "step": 7425 + }, + { + "epoch": 2.2540597966307483, + "grad_norm": 0.6389353275299072, + "learning_rate": 6.245823630657083e-05, + "loss": 0.6711, + "step": 7426 + }, + { + "epoch": 2.2543633328274395, + "grad_norm": 0.5891460180282593, + "learning_rate": 6.245317404070063e-05, + "loss": 1.0786, + "step": 7427 + }, + { + "epoch": 2.254666869024131, + "grad_norm": 0.7612578868865967, + "learning_rate": 6.244811177483042e-05, + "loss": 1.4422, + "step": 7428 + }, + { + "epoch": 2.2549704052208224, + "grad_norm": 0.7443048357963562, + "learning_rate": 6.244304950896022e-05, + "loss": 1.3552, + "step": 7429 + }, + { + "epoch": 2.255273941417514, + "grad_norm": 0.8717397451400757, + "learning_rate": 6.243798724309001e-05, + "loss": 1.2675, + "step": 7430 + }, + { + "epoch": 2.2555774776142057, + "grad_norm": 0.6923880577087402, + "learning_rate": 6.24329249772198e-05, + "loss": 1.462, + "step": 7431 + }, + { + "epoch": 2.255881013810897, + "grad_norm": 0.6938219666481018, + "learning_rate": 6.24278627113496e-05, + "loss": 1.4014, + "step": 7432 + }, + { + "epoch": 2.2561845500075886, + "grad_norm": 0.700583815574646, + "learning_rate": 6.24228004454794e-05, + "loss": 1.3725, + "step": 7433 + }, + { + "epoch": 2.2564880862042798, + "grad_norm": 0.7408208847045898, + "learning_rate": 6.24177381796092e-05, + "loss": 0.7655, + "step": 7434 + }, + { + "epoch": 2.2567916224009714, + "grad_norm": 0.6944329142570496, + "learning_rate": 6.2412675913739e-05, + "loss": 1.5368, + "step": 7435 + }, + { + "epoch": 2.2570951585976626, + "grad_norm": 0.6069648861885071, + "learning_rate": 6.24076136478688e-05, + "loss": 0.6665, + "step": 7436 + }, + { + "epoch": 2.2573986947943543, + "grad_norm": 0.6946176290512085, + "learning_rate": 6.240255138199859e-05, + "loss": 1.5272, + "step": 7437 + }, + { + "epoch": 2.2577022309910455, + "grad_norm": 1.039425015449524, + "learning_rate": 6.239748911612838e-05, + "loss": 1.4667, + "step": 7438 + }, + { + "epoch": 2.258005767187737, + "grad_norm": 0.8364284634590149, + "learning_rate": 6.239242685025818e-05, + "loss": 1.2519, + "step": 7439 + }, + { + "epoch": 2.2583093033844284, + "grad_norm": 0.9233106374740601, + "learning_rate": 6.238736458438797e-05, + "loss": 1.3155, + "step": 7440 + }, + { + "epoch": 2.25861283958112, + "grad_norm": 0.5708487033843994, + "learning_rate": 6.238230231851777e-05, + "loss": 1.4757, + "step": 7441 + }, + { + "epoch": 2.2589163757778117, + "grad_norm": 0.7805628776550293, + "learning_rate": 6.237724005264756e-05, + "loss": 1.2103, + "step": 7442 + }, + { + "epoch": 2.259219911974503, + "grad_norm": 0.699892520904541, + "learning_rate": 6.237217778677736e-05, + "loss": 1.0468, + "step": 7443 + }, + { + "epoch": 2.2595234481711945, + "grad_norm": 0.6304746270179749, + "learning_rate": 6.236711552090717e-05, + "loss": 0.9349, + "step": 7444 + }, + { + "epoch": 2.2598269843678858, + "grad_norm": 0.7185422778129578, + "learning_rate": 6.236205325503696e-05, + "loss": 1.4341, + "step": 7445 + }, + { + "epoch": 2.2601305205645774, + "grad_norm": 0.56112140417099, + "learning_rate": 6.235699098916676e-05, + "loss": 1.1936, + "step": 7446 + }, + { + "epoch": 2.2604340567612686, + "grad_norm": 0.6657426357269287, + "learning_rate": 6.235192872329655e-05, + "loss": 1.1362, + "step": 7447 + }, + { + "epoch": 2.2607375929579603, + "grad_norm": 0.4865644872188568, + "learning_rate": 6.234686645742634e-05, + "loss": 0.6632, + "step": 7448 + }, + { + "epoch": 2.261041129154652, + "grad_norm": 0.8862194418907166, + "learning_rate": 6.234180419155614e-05, + "loss": 1.2728, + "step": 7449 + }, + { + "epoch": 2.261344665351343, + "grad_norm": 0.8285862803459167, + "learning_rate": 6.233674192568593e-05, + "loss": 1.1827, + "step": 7450 + }, + { + "epoch": 2.2616482015480344, + "grad_norm": 0.821243941783905, + "learning_rate": 6.233167965981573e-05, + "loss": 1.0182, + "step": 7451 + }, + { + "epoch": 2.261951737744726, + "grad_norm": 0.7204463481903076, + "learning_rate": 6.232661739394552e-05, + "loss": 1.277, + "step": 7452 + }, + { + "epoch": 2.2622552739414177, + "grad_norm": 0.880133867263794, + "learning_rate": 6.232155512807533e-05, + "loss": 1.6253, + "step": 7453 + }, + { + "epoch": 2.262558810138109, + "grad_norm": 0.6118894815444946, + "learning_rate": 6.231649286220513e-05, + "loss": 1.3237, + "step": 7454 + }, + { + "epoch": 2.2628623463348005, + "grad_norm": 0.863484799861908, + "learning_rate": 6.231143059633492e-05, + "loss": 0.9433, + "step": 7455 + }, + { + "epoch": 2.2631658825314918, + "grad_norm": 0.6867091059684753, + "learning_rate": 6.230636833046472e-05, + "loss": 1.393, + "step": 7456 + }, + { + "epoch": 2.2634694187281834, + "grad_norm": 0.7559130787849426, + "learning_rate": 6.230130606459451e-05, + "loss": 1.4322, + "step": 7457 + }, + { + "epoch": 2.2637729549248746, + "grad_norm": 0.7188666462898254, + "learning_rate": 6.22962437987243e-05, + "loss": 1.1943, + "step": 7458 + }, + { + "epoch": 2.2640764911215663, + "grad_norm": 0.6434985399246216, + "learning_rate": 6.22911815328541e-05, + "loss": 1.4742, + "step": 7459 + }, + { + "epoch": 2.264380027318258, + "grad_norm": 0.8641550540924072, + "learning_rate": 6.22861192669839e-05, + "loss": 1.0067, + "step": 7460 + }, + { + "epoch": 2.264683563514949, + "grad_norm": 0.816226065158844, + "learning_rate": 6.228105700111369e-05, + "loss": 1.2797, + "step": 7461 + }, + { + "epoch": 2.264987099711641, + "grad_norm": 0.9310057163238525, + "learning_rate": 6.22759947352435e-05, + "loss": 0.9412, + "step": 7462 + }, + { + "epoch": 2.265290635908332, + "grad_norm": 0.7045341730117798, + "learning_rate": 6.22709324693733e-05, + "loss": 1.1715, + "step": 7463 + }, + { + "epoch": 2.2655941721050237, + "grad_norm": 0.6052013635635376, + "learning_rate": 6.22658702035031e-05, + "loss": 1.8839, + "step": 7464 + }, + { + "epoch": 2.265897708301715, + "grad_norm": 0.6659092307090759, + "learning_rate": 6.22608079376329e-05, + "loss": 1.6587, + "step": 7465 + }, + { + "epoch": 2.2662012444984065, + "grad_norm": 0.9298098087310791, + "learning_rate": 6.225574567176269e-05, + "loss": 0.8736, + "step": 7466 + }, + { + "epoch": 2.2665047806950978, + "grad_norm": 0.5785535573959351, + "learning_rate": 6.225068340589249e-05, + "loss": 1.4056, + "step": 7467 + }, + { + "epoch": 2.2668083168917894, + "grad_norm": 0.7402830719947815, + "learning_rate": 6.224562114002228e-05, + "loss": 1.4276, + "step": 7468 + }, + { + "epoch": 2.2671118530884806, + "grad_norm": 0.7167106866836548, + "learning_rate": 6.224055887415208e-05, + "loss": 1.2924, + "step": 7469 + }, + { + "epoch": 2.2674153892851723, + "grad_norm": 0.783779501914978, + "learning_rate": 6.223549660828187e-05, + "loss": 1.4738, + "step": 7470 + }, + { + "epoch": 2.267718925481864, + "grad_norm": 0.516488790512085, + "learning_rate": 6.223043434241167e-05, + "loss": 1.1022, + "step": 7471 + }, + { + "epoch": 2.268022461678555, + "grad_norm": 0.6242533326148987, + "learning_rate": 6.222537207654146e-05, + "loss": 1.0569, + "step": 7472 + }, + { + "epoch": 2.268325997875247, + "grad_norm": 0.7155179977416992, + "learning_rate": 6.222030981067127e-05, + "loss": 1.4912, + "step": 7473 + }, + { + "epoch": 2.268629534071938, + "grad_norm": 0.6795659065246582, + "learning_rate": 6.221524754480106e-05, + "loss": 0.9815, + "step": 7474 + }, + { + "epoch": 2.2689330702686297, + "grad_norm": 0.8595495820045471, + "learning_rate": 6.221018527893086e-05, + "loss": 1.4531, + "step": 7475 + }, + { + "epoch": 2.269236606465321, + "grad_norm": 0.7935011386871338, + "learning_rate": 6.220512301306065e-05, + "loss": 1.347, + "step": 7476 + }, + { + "epoch": 2.2695401426620125, + "grad_norm": 0.8852940201759338, + "learning_rate": 6.220006074719045e-05, + "loss": 1.4368, + "step": 7477 + }, + { + "epoch": 2.2698436788587038, + "grad_norm": 0.625978410243988, + "learning_rate": 6.219499848132024e-05, + "loss": 1.0929, + "step": 7478 + }, + { + "epoch": 2.2701472150553954, + "grad_norm": 0.7276142835617065, + "learning_rate": 6.218993621545004e-05, + "loss": 0.6517, + "step": 7479 + }, + { + "epoch": 2.2704507512520866, + "grad_norm": 0.8396570682525635, + "learning_rate": 6.218487394957983e-05, + "loss": 1.2783, + "step": 7480 + }, + { + "epoch": 2.2707542874487783, + "grad_norm": 0.6671884655952454, + "learning_rate": 6.217981168370963e-05, + "loss": 1.2476, + "step": 7481 + }, + { + "epoch": 2.27105782364547, + "grad_norm": 0.7303450107574463, + "learning_rate": 6.217474941783942e-05, + "loss": 1.0739, + "step": 7482 + }, + { + "epoch": 2.271361359842161, + "grad_norm": 0.7783435583114624, + "learning_rate": 6.216968715196923e-05, + "loss": 1.418, + "step": 7483 + }, + { + "epoch": 2.271664896038853, + "grad_norm": 0.6920360326766968, + "learning_rate": 6.216462488609903e-05, + "loss": 1.4488, + "step": 7484 + }, + { + "epoch": 2.271968432235544, + "grad_norm": 0.5489271879196167, + "learning_rate": 6.215956262022882e-05, + "loss": 1.6853, + "step": 7485 + }, + { + "epoch": 2.2722719684322357, + "grad_norm": 0.8495591282844543, + "learning_rate": 6.215450035435861e-05, + "loss": 1.6002, + "step": 7486 + }, + { + "epoch": 2.272575504628927, + "grad_norm": 0.7131351232528687, + "learning_rate": 6.214943808848841e-05, + "loss": 1.1706, + "step": 7487 + }, + { + "epoch": 2.2728790408256185, + "grad_norm": 0.9178603291511536, + "learning_rate": 6.21443758226182e-05, + "loss": 1.368, + "step": 7488 + }, + { + "epoch": 2.2731825770223097, + "grad_norm": 0.7738288640975952, + "learning_rate": 6.2139313556748e-05, + "loss": 1.0531, + "step": 7489 + }, + { + "epoch": 2.2734861132190014, + "grad_norm": 0.6220824122428894, + "learning_rate": 6.21342512908778e-05, + "loss": 0.8149, + "step": 7490 + }, + { + "epoch": 2.2737896494156926, + "grad_norm": 0.5784722566604614, + "learning_rate": 6.212918902500759e-05, + "loss": 0.9504, + "step": 7491 + }, + { + "epoch": 2.2740931856123843, + "grad_norm": 0.9277094006538391, + "learning_rate": 6.21241267591374e-05, + "loss": 1.1672, + "step": 7492 + }, + { + "epoch": 2.274396721809076, + "grad_norm": 0.8855558037757874, + "learning_rate": 6.211906449326719e-05, + "loss": 1.2628, + "step": 7493 + }, + { + "epoch": 2.274700258005767, + "grad_norm": 0.7635779976844788, + "learning_rate": 6.211400222739699e-05, + "loss": 1.3246, + "step": 7494 + }, + { + "epoch": 2.275003794202459, + "grad_norm": 0.6834839582443237, + "learning_rate": 6.210893996152678e-05, + "loss": 1.2209, + "step": 7495 + }, + { + "epoch": 2.27530733039915, + "grad_norm": 0.6764331459999084, + "learning_rate": 6.210387769565658e-05, + "loss": 1.6269, + "step": 7496 + }, + { + "epoch": 2.2756108665958417, + "grad_norm": 0.8024922013282776, + "learning_rate": 6.209881542978637e-05, + "loss": 1.3489, + "step": 7497 + }, + { + "epoch": 2.275914402792533, + "grad_norm": 0.7418940663337708, + "learning_rate": 6.209375316391617e-05, + "loss": 0.8103, + "step": 7498 + }, + { + "epoch": 2.2762179389892245, + "grad_norm": 0.9875561594963074, + "learning_rate": 6.208869089804596e-05, + "loss": 0.8744, + "step": 7499 + }, + { + "epoch": 2.2765214751859157, + "grad_norm": 0.8351328372955322, + "learning_rate": 6.208362863217576e-05, + "loss": 1.4052, + "step": 7500 + }, + { + "epoch": 2.2768250113826074, + "grad_norm": 0.7316960692405701, + "learning_rate": 6.207856636630556e-05, + "loss": 1.3071, + "step": 7501 + }, + { + "epoch": 2.2771285475792986, + "grad_norm": 0.7554438710212708, + "learning_rate": 6.207350410043536e-05, + "loss": 0.9889, + "step": 7502 + }, + { + "epoch": 2.2774320837759903, + "grad_norm": 0.7924057841300964, + "learning_rate": 6.206844183456515e-05, + "loss": 1.225, + "step": 7503 + }, + { + "epoch": 2.277735619972682, + "grad_norm": 0.619770884513855, + "learning_rate": 6.206337956869495e-05, + "loss": 1.5899, + "step": 7504 + }, + { + "epoch": 2.278039156169373, + "grad_norm": 0.8481298685073853, + "learning_rate": 6.205831730282474e-05, + "loss": 1.2686, + "step": 7505 + }, + { + "epoch": 2.278342692366065, + "grad_norm": 0.7642682790756226, + "learning_rate": 6.205325503695454e-05, + "loss": 1.0228, + "step": 7506 + }, + { + "epoch": 2.278646228562756, + "grad_norm": 0.712116003036499, + "learning_rate": 6.204819277108435e-05, + "loss": 1.3615, + "step": 7507 + }, + { + "epoch": 2.2789497647594477, + "grad_norm": 0.733302891254425, + "learning_rate": 6.204313050521414e-05, + "loss": 1.2985, + "step": 7508 + }, + { + "epoch": 2.279253300956139, + "grad_norm": 0.8620637655258179, + "learning_rate": 6.203806823934394e-05, + "loss": 1.291, + "step": 7509 + }, + { + "epoch": 2.2795568371528305, + "grad_norm": 0.7499003410339355, + "learning_rate": 6.203300597347373e-05, + "loss": 1.2764, + "step": 7510 + }, + { + "epoch": 2.2798603733495217, + "grad_norm": 0.8012336492538452, + "learning_rate": 6.202794370760353e-05, + "loss": 0.9868, + "step": 7511 + }, + { + "epoch": 2.2801639095462134, + "grad_norm": 0.9195525646209717, + "learning_rate": 6.202288144173333e-05, + "loss": 1.4727, + "step": 7512 + }, + { + "epoch": 2.2804674457429046, + "grad_norm": 0.8243017196655273, + "learning_rate": 6.201781917586313e-05, + "loss": 1.1061, + "step": 7513 + }, + { + "epoch": 2.2807709819395963, + "grad_norm": 0.8141716718673706, + "learning_rate": 6.201275690999292e-05, + "loss": 1.5759, + "step": 7514 + }, + { + "epoch": 2.281074518136288, + "grad_norm": 0.9679292440414429, + "learning_rate": 6.200769464412272e-05, + "loss": 1.1664, + "step": 7515 + }, + { + "epoch": 2.281378054332979, + "grad_norm": 0.8407115340232849, + "learning_rate": 6.200263237825251e-05, + "loss": 1.3996, + "step": 7516 + }, + { + "epoch": 2.281681590529671, + "grad_norm": 0.8369561433792114, + "learning_rate": 6.199757011238231e-05, + "loss": 1.3342, + "step": 7517 + }, + { + "epoch": 2.281985126726362, + "grad_norm": 0.6965354084968567, + "learning_rate": 6.19925078465121e-05, + "loss": 0.7063, + "step": 7518 + }, + { + "epoch": 2.2822886629230537, + "grad_norm": 0.8770589232444763, + "learning_rate": 6.19874455806419e-05, + "loss": 1.28, + "step": 7519 + }, + { + "epoch": 2.282592199119745, + "grad_norm": 0.821842610836029, + "learning_rate": 6.198238331477169e-05, + "loss": 1.1275, + "step": 7520 + }, + { + "epoch": 2.2828957353164365, + "grad_norm": 0.828779935836792, + "learning_rate": 6.197732104890149e-05, + "loss": 1.3829, + "step": 7521 + }, + { + "epoch": 2.283199271513128, + "grad_norm": 0.7814561128616333, + "learning_rate": 6.19722587830313e-05, + "loss": 1.5446, + "step": 7522 + }, + { + "epoch": 2.2835028077098194, + "grad_norm": 0.7407017350196838, + "learning_rate": 6.196719651716109e-05, + "loss": 1.4936, + "step": 7523 + }, + { + "epoch": 2.2838063439065106, + "grad_norm": 0.7612883448600769, + "learning_rate": 6.196213425129088e-05, + "loss": 1.4638, + "step": 7524 + }, + { + "epoch": 2.2841098801032023, + "grad_norm": 0.6154638528823853, + "learning_rate": 6.195707198542068e-05, + "loss": 1.8089, + "step": 7525 + }, + { + "epoch": 2.284413416299894, + "grad_norm": 0.9546413421630859, + "learning_rate": 6.195200971955047e-05, + "loss": 1.5108, + "step": 7526 + }, + { + "epoch": 2.284716952496585, + "grad_norm": 0.7517131567001343, + "learning_rate": 6.194694745368027e-05, + "loss": 1.0738, + "step": 7527 + }, + { + "epoch": 2.285020488693277, + "grad_norm": 0.7600359320640564, + "learning_rate": 6.194188518781006e-05, + "loss": 1.5688, + "step": 7528 + }, + { + "epoch": 2.285324024889968, + "grad_norm": 0.7792675495147705, + "learning_rate": 6.193682292193986e-05, + "loss": 0.9124, + "step": 7529 + }, + { + "epoch": 2.2856275610866597, + "grad_norm": 0.5790246725082397, + "learning_rate": 6.193176065606965e-05, + "loss": 1.0978, + "step": 7530 + }, + { + "epoch": 2.285931097283351, + "grad_norm": 0.7425166368484497, + "learning_rate": 6.192669839019946e-05, + "loss": 1.1871, + "step": 7531 + }, + { + "epoch": 2.2862346334800425, + "grad_norm": 0.6829063296318054, + "learning_rate": 6.192163612432926e-05, + "loss": 1.3564, + "step": 7532 + }, + { + "epoch": 2.286538169676734, + "grad_norm": 0.8501928448677063, + "learning_rate": 6.191657385845905e-05, + "loss": 1.0968, + "step": 7533 + }, + { + "epoch": 2.2868417058734254, + "grad_norm": 0.7157674431800842, + "learning_rate": 6.191151159258885e-05, + "loss": 1.4422, + "step": 7534 + }, + { + "epoch": 2.287145242070117, + "grad_norm": 0.6410207152366638, + "learning_rate": 6.190644932671864e-05, + "loss": 1.8158, + "step": 7535 + }, + { + "epoch": 2.2874487782668083, + "grad_norm": 0.7278507947921753, + "learning_rate": 6.190138706084844e-05, + "loss": 1.0099, + "step": 7536 + }, + { + "epoch": 2.2877523144635, + "grad_norm": 0.6953244805335999, + "learning_rate": 6.189632479497823e-05, + "loss": 1.3812, + "step": 7537 + }, + { + "epoch": 2.288055850660191, + "grad_norm": 0.7524649500846863, + "learning_rate": 6.189126252910803e-05, + "loss": 1.3067, + "step": 7538 + }, + { + "epoch": 2.288359386856883, + "grad_norm": 0.8401016592979431, + "learning_rate": 6.188620026323782e-05, + "loss": 0.9991, + "step": 7539 + }, + { + "epoch": 2.288662923053574, + "grad_norm": 0.8146904110908508, + "learning_rate": 6.188113799736763e-05, + "loss": 0.8624, + "step": 7540 + }, + { + "epoch": 2.2889664592502657, + "grad_norm": 0.815436601638794, + "learning_rate": 6.187607573149742e-05, + "loss": 1.2165, + "step": 7541 + }, + { + "epoch": 2.289269995446957, + "grad_norm": 0.817846953868866, + "learning_rate": 6.187101346562722e-05, + "loss": 1.546, + "step": 7542 + }, + { + "epoch": 2.2895735316436485, + "grad_norm": 0.7438780665397644, + "learning_rate": 6.186595119975701e-05, + "loss": 1.14, + "step": 7543 + }, + { + "epoch": 2.28987706784034, + "grad_norm": 0.7665032148361206, + "learning_rate": 6.186088893388681e-05, + "loss": 1.4702, + "step": 7544 + }, + { + "epoch": 2.2901806040370314, + "grad_norm": 0.736453115940094, + "learning_rate": 6.18558266680166e-05, + "loss": 1.2116, + "step": 7545 + }, + { + "epoch": 2.290484140233723, + "grad_norm": 0.8457592129707336, + "learning_rate": 6.18507644021464e-05, + "loss": 1.7704, + "step": 7546 + }, + { + "epoch": 2.2907876764304143, + "grad_norm": 0.7538440823554993, + "learning_rate": 6.184570213627619e-05, + "loss": 1.2569, + "step": 7547 + }, + { + "epoch": 2.291091212627106, + "grad_norm": 0.7179580926895142, + "learning_rate": 6.184063987040599e-05, + "loss": 1.3755, + "step": 7548 + }, + { + "epoch": 2.291394748823797, + "grad_norm": 0.7420378923416138, + "learning_rate": 6.183557760453578e-05, + "loss": 1.4729, + "step": 7549 + }, + { + "epoch": 2.291698285020489, + "grad_norm": 0.789522647857666, + "learning_rate": 6.183051533866559e-05, + "loss": 1.383, + "step": 7550 + }, + { + "epoch": 2.29200182121718, + "grad_norm": 0.7265890836715698, + "learning_rate": 6.182545307279538e-05, + "loss": 1.5371, + "step": 7551 + }, + { + "epoch": 2.2923053574138716, + "grad_norm": 0.7750800251960754, + "learning_rate": 6.182039080692519e-05, + "loss": 1.3417, + "step": 7552 + }, + { + "epoch": 2.292608893610563, + "grad_norm": 0.8087584972381592, + "learning_rate": 6.181532854105499e-05, + "loss": 1.2937, + "step": 7553 + }, + { + "epoch": 2.2929124298072545, + "grad_norm": 1.0158443450927734, + "learning_rate": 6.181026627518478e-05, + "loss": 1.2413, + "step": 7554 + }, + { + "epoch": 2.293215966003946, + "grad_norm": 0.7395852208137512, + "learning_rate": 6.180520400931458e-05, + "loss": 1.3702, + "step": 7555 + }, + { + "epoch": 2.2935195022006374, + "grad_norm": 0.8146117925643921, + "learning_rate": 6.180014174344437e-05, + "loss": 0.7595, + "step": 7556 + }, + { + "epoch": 2.293823038397329, + "grad_norm": 0.8666897416114807, + "learning_rate": 6.179507947757417e-05, + "loss": 1.5115, + "step": 7557 + }, + { + "epoch": 2.2941265745940203, + "grad_norm": 0.8771815299987793, + "learning_rate": 6.179001721170396e-05, + "loss": 1.4715, + "step": 7558 + }, + { + "epoch": 2.294430110790712, + "grad_norm": 0.810228168964386, + "learning_rate": 6.178495494583376e-05, + "loss": 1.4295, + "step": 7559 + }, + { + "epoch": 2.294733646987403, + "grad_norm": 0.6954602003097534, + "learning_rate": 6.177989267996355e-05, + "loss": 1.2752, + "step": 7560 + }, + { + "epoch": 2.2950371831840948, + "grad_norm": 0.7767638564109802, + "learning_rate": 6.177483041409336e-05, + "loss": 1.2908, + "step": 7561 + }, + { + "epoch": 2.295340719380786, + "grad_norm": 0.6568270921707153, + "learning_rate": 6.176976814822315e-05, + "loss": 0.9384, + "step": 7562 + }, + { + "epoch": 2.2956442555774776, + "grad_norm": 0.7226028442382812, + "learning_rate": 6.176470588235295e-05, + "loss": 1.1192, + "step": 7563 + }, + { + "epoch": 2.295947791774169, + "grad_norm": 0.685576319694519, + "learning_rate": 6.175964361648274e-05, + "loss": 1.1595, + "step": 7564 + }, + { + "epoch": 2.2962513279708605, + "grad_norm": 0.4528180658817291, + "learning_rate": 6.175458135061254e-05, + "loss": 1.6221, + "step": 7565 + }, + { + "epoch": 2.296554864167552, + "grad_norm": 0.6942249536514282, + "learning_rate": 6.174951908474233e-05, + "loss": 1.1044, + "step": 7566 + }, + { + "epoch": 2.2968584003642434, + "grad_norm": 0.7307713031768799, + "learning_rate": 6.174445681887213e-05, + "loss": 1.3832, + "step": 7567 + }, + { + "epoch": 2.297161936560935, + "grad_norm": 0.6575482487678528, + "learning_rate": 6.173939455300192e-05, + "loss": 1.4703, + "step": 7568 + }, + { + "epoch": 2.2974654727576262, + "grad_norm": 0.8544819355010986, + "learning_rate": 6.173433228713172e-05, + "loss": 1.3402, + "step": 7569 + }, + { + "epoch": 2.297769008954318, + "grad_norm": 0.7268744111061096, + "learning_rate": 6.172927002126153e-05, + "loss": 1.6343, + "step": 7570 + }, + { + "epoch": 2.298072545151009, + "grad_norm": 0.6348064541816711, + "learning_rate": 6.172420775539132e-05, + "loss": 1.3316, + "step": 7571 + }, + { + "epoch": 2.2983760813477008, + "grad_norm": 0.7735495567321777, + "learning_rate": 6.171914548952112e-05, + "loss": 1.4615, + "step": 7572 + }, + { + "epoch": 2.298679617544392, + "grad_norm": 0.8103759288787842, + "learning_rate": 6.171408322365091e-05, + "loss": 1.5418, + "step": 7573 + }, + { + "epoch": 2.2989831537410836, + "grad_norm": 0.7252805233001709, + "learning_rate": 6.17090209577807e-05, + "loss": 0.8511, + "step": 7574 + }, + { + "epoch": 2.299286689937775, + "grad_norm": 0.8680673241615295, + "learning_rate": 6.17039586919105e-05, + "loss": 1.3099, + "step": 7575 + }, + { + "epoch": 2.2995902261344665, + "grad_norm": 0.5763619542121887, + "learning_rate": 6.16988964260403e-05, + "loss": 1.7721, + "step": 7576 + }, + { + "epoch": 2.299893762331158, + "grad_norm": 0.7439877986907959, + "learning_rate": 6.169383416017009e-05, + "loss": 1.1515, + "step": 7577 + }, + { + "epoch": 2.3001972985278494, + "grad_norm": 0.7698675990104675, + "learning_rate": 6.168877189429988e-05, + "loss": 1.2372, + "step": 7578 + }, + { + "epoch": 2.300500834724541, + "grad_norm": 0.7722331285476685, + "learning_rate": 6.168370962842969e-05, + "loss": 1.6233, + "step": 7579 + }, + { + "epoch": 2.3008043709212322, + "grad_norm": 0.8871555924415588, + "learning_rate": 6.167864736255949e-05, + "loss": 1.4354, + "step": 7580 + }, + { + "epoch": 2.301107907117924, + "grad_norm": 0.8712011575698853, + "learning_rate": 6.167358509668928e-05, + "loss": 1.1775, + "step": 7581 + }, + { + "epoch": 2.301411443314615, + "grad_norm": 0.9085456728935242, + "learning_rate": 6.166852283081908e-05, + "loss": 1.4472, + "step": 7582 + }, + { + "epoch": 2.3017149795113068, + "grad_norm": 0.8338646292686462, + "learning_rate": 6.166346056494887e-05, + "loss": 1.2795, + "step": 7583 + }, + { + "epoch": 2.3020185157079984, + "grad_norm": 0.6676700711250305, + "learning_rate": 6.165839829907867e-05, + "loss": 1.4605, + "step": 7584 + }, + { + "epoch": 2.3023220519046896, + "grad_norm": 0.8141628503799438, + "learning_rate": 6.165333603320846e-05, + "loss": 1.3307, + "step": 7585 + }, + { + "epoch": 2.302625588101381, + "grad_norm": 0.6280063986778259, + "learning_rate": 6.164827376733826e-05, + "loss": 1.3559, + "step": 7586 + }, + { + "epoch": 2.3029291242980725, + "grad_norm": 1.0572842359542847, + "learning_rate": 6.164321150146805e-05, + "loss": 0.9411, + "step": 7587 + }, + { + "epoch": 2.303232660494764, + "grad_norm": 0.7565344572067261, + "learning_rate": 6.163814923559785e-05, + "loss": 1.1205, + "step": 7588 + }, + { + "epoch": 2.3035361966914554, + "grad_norm": 0.9521320462226868, + "learning_rate": 6.163308696972765e-05, + "loss": 1.1303, + "step": 7589 + }, + { + "epoch": 2.303839732888147, + "grad_norm": 0.8380699157714844, + "learning_rate": 6.162802470385745e-05, + "loss": 1.1181, + "step": 7590 + }, + { + "epoch": 2.3041432690848382, + "grad_norm": 0.7872660756111145, + "learning_rate": 6.162296243798724e-05, + "loss": 1.3316, + "step": 7591 + }, + { + "epoch": 2.30444680528153, + "grad_norm": 0.6858444213867188, + "learning_rate": 6.161790017211704e-05, + "loss": 1.2919, + "step": 7592 + }, + { + "epoch": 2.304750341478221, + "grad_norm": 0.7087762951850891, + "learning_rate": 6.161283790624683e-05, + "loss": 1.4641, + "step": 7593 + }, + { + "epoch": 2.3050538776749128, + "grad_norm": 0.7477747201919556, + "learning_rate": 6.160777564037663e-05, + "loss": 1.2525, + "step": 7594 + }, + { + "epoch": 2.3053574138716044, + "grad_norm": 0.8030606508255005, + "learning_rate": 6.160271337450642e-05, + "loss": 1.4686, + "step": 7595 + }, + { + "epoch": 2.3056609500682956, + "grad_norm": 0.7098719477653503, + "learning_rate": 6.159765110863623e-05, + "loss": 1.747, + "step": 7596 + }, + { + "epoch": 2.3059644862649873, + "grad_norm": 0.7161492109298706, + "learning_rate": 6.159258884276603e-05, + "loss": 1.5472, + "step": 7597 + }, + { + "epoch": 2.3062680224616785, + "grad_norm": 0.6479907631874084, + "learning_rate": 6.158752657689582e-05, + "loss": 1.3357, + "step": 7598 + }, + { + "epoch": 2.30657155865837, + "grad_norm": 0.97112637758255, + "learning_rate": 6.158246431102562e-05, + "loss": 1.2842, + "step": 7599 + }, + { + "epoch": 2.3068750948550614, + "grad_norm": 0.8729383945465088, + "learning_rate": 6.157740204515542e-05, + "loss": 1.2892, + "step": 7600 + }, + { + "epoch": 2.307178631051753, + "grad_norm": 0.7437787652015686, + "learning_rate": 6.157233977928522e-05, + "loss": 1.4824, + "step": 7601 + }, + { + "epoch": 2.3074821672484442, + "grad_norm": 0.7232855558395386, + "learning_rate": 6.156727751341501e-05, + "loss": 0.8485, + "step": 7602 + }, + { + "epoch": 2.307785703445136, + "grad_norm": 0.800493061542511, + "learning_rate": 6.156221524754481e-05, + "loss": 1.572, + "step": 7603 + }, + { + "epoch": 2.308089239641827, + "grad_norm": 0.9014605283737183, + "learning_rate": 6.15571529816746e-05, + "loss": 1.2663, + "step": 7604 + }, + { + "epoch": 2.3083927758385188, + "grad_norm": 1.083298921585083, + "learning_rate": 6.15520907158044e-05, + "loss": 1.5554, + "step": 7605 + }, + { + "epoch": 2.3086963120352104, + "grad_norm": 0.695650041103363, + "learning_rate": 6.15470284499342e-05, + "loss": 1.4479, + "step": 7606 + }, + { + "epoch": 2.3089998482319016, + "grad_norm": 0.7614145874977112, + "learning_rate": 6.154196618406399e-05, + "loss": 1.31, + "step": 7607 + }, + { + "epoch": 2.3093033844285933, + "grad_norm": 0.8237013220787048, + "learning_rate": 6.153690391819378e-05, + "loss": 1.0565, + "step": 7608 + }, + { + "epoch": 2.3096069206252845, + "grad_norm": 0.8441504240036011, + "learning_rate": 6.153184165232359e-05, + "loss": 1.3506, + "step": 7609 + }, + { + "epoch": 2.309910456821976, + "grad_norm": 0.8419016599655151, + "learning_rate": 6.152677938645339e-05, + "loss": 1.3949, + "step": 7610 + }, + { + "epoch": 2.3102139930186674, + "grad_norm": 0.7505916953086853, + "learning_rate": 6.152171712058318e-05, + "loss": 1.07, + "step": 7611 + }, + { + "epoch": 2.310517529215359, + "grad_norm": 0.7543771266937256, + "learning_rate": 6.151665485471298e-05, + "loss": 1.6368, + "step": 7612 + }, + { + "epoch": 2.3108210654120502, + "grad_norm": 0.7415176033973694, + "learning_rate": 6.151159258884277e-05, + "loss": 1.3673, + "step": 7613 + }, + { + "epoch": 2.311124601608742, + "grad_norm": 0.7070369720458984, + "learning_rate": 6.150653032297257e-05, + "loss": 1.4192, + "step": 7614 + }, + { + "epoch": 2.311428137805433, + "grad_norm": 0.7339876294136047, + "learning_rate": 6.150146805710236e-05, + "loss": 1.4539, + "step": 7615 + }, + { + "epoch": 2.3117316740021248, + "grad_norm": 0.8350193500518799, + "learning_rate": 6.149640579123215e-05, + "loss": 1.3213, + "step": 7616 + }, + { + "epoch": 2.3120352101988164, + "grad_norm": 0.894660234451294, + "learning_rate": 6.149134352536195e-05, + "loss": 0.9378, + "step": 7617 + }, + { + "epoch": 2.3123387463955076, + "grad_norm": 0.7360272407531738, + "learning_rate": 6.148628125949176e-05, + "loss": 1.4664, + "step": 7618 + }, + { + "epoch": 2.3126422825921993, + "grad_norm": 0.735359787940979, + "learning_rate": 6.148121899362155e-05, + "loss": 1.548, + "step": 7619 + }, + { + "epoch": 2.3129458187888905, + "grad_norm": 0.6220079064369202, + "learning_rate": 6.147615672775135e-05, + "loss": 1.5925, + "step": 7620 + }, + { + "epoch": 2.313249354985582, + "grad_norm": 0.6058359146118164, + "learning_rate": 6.147109446188114e-05, + "loss": 1.094, + "step": 7621 + }, + { + "epoch": 2.3135528911822734, + "grad_norm": 0.7334878444671631, + "learning_rate": 6.146603219601094e-05, + "loss": 1.3287, + "step": 7622 + }, + { + "epoch": 2.313856427378965, + "grad_norm": 0.7956668138504028, + "learning_rate": 6.146096993014073e-05, + "loss": 1.3067, + "step": 7623 + }, + { + "epoch": 2.3141599635756562, + "grad_norm": 0.8465598821640015, + "learning_rate": 6.145590766427053e-05, + "loss": 1.3953, + "step": 7624 + }, + { + "epoch": 2.314463499772348, + "grad_norm": 0.936220645904541, + "learning_rate": 6.145084539840032e-05, + "loss": 1.4161, + "step": 7625 + }, + { + "epoch": 2.314767035969039, + "grad_norm": 0.5641535520553589, + "learning_rate": 6.144578313253012e-05, + "loss": 1.1868, + "step": 7626 + }, + { + "epoch": 2.3150705721657308, + "grad_norm": 0.7781814932823181, + "learning_rate": 6.144072086665991e-05, + "loss": 1.3219, + "step": 7627 + }, + { + "epoch": 2.3153741083624224, + "grad_norm": 0.7027815580368042, + "learning_rate": 6.143565860078972e-05, + "loss": 1.4012, + "step": 7628 + }, + { + "epoch": 2.3156776445591136, + "grad_norm": 0.8026196956634521, + "learning_rate": 6.143059633491951e-05, + "loss": 1.221, + "step": 7629 + }, + { + "epoch": 2.3159811807558053, + "grad_norm": 0.8572709560394287, + "learning_rate": 6.142553406904931e-05, + "loss": 1.0766, + "step": 7630 + }, + { + "epoch": 2.3162847169524965, + "grad_norm": 0.6600764393806458, + "learning_rate": 6.14204718031791e-05, + "loss": 1.7696, + "step": 7631 + }, + { + "epoch": 2.316588253149188, + "grad_norm": 0.80988609790802, + "learning_rate": 6.14154095373089e-05, + "loss": 1.3428, + "step": 7632 + }, + { + "epoch": 2.3168917893458794, + "grad_norm": 0.6442969441413879, + "learning_rate": 6.14103472714387e-05, + "loss": 1.6526, + "step": 7633 + }, + { + "epoch": 2.317195325542571, + "grad_norm": 0.7561648488044739, + "learning_rate": 6.140528500556849e-05, + "loss": 1.661, + "step": 7634 + }, + { + "epoch": 2.3174988617392622, + "grad_norm": 0.8251952528953552, + "learning_rate": 6.140022273969828e-05, + "loss": 1.5765, + "step": 7635 + }, + { + "epoch": 2.317802397935954, + "grad_norm": 0.7034004330635071, + "learning_rate": 6.139516047382808e-05, + "loss": 0.896, + "step": 7636 + }, + { + "epoch": 2.318105934132645, + "grad_norm": 0.713154137134552, + "learning_rate": 6.139009820795789e-05, + "loss": 1.534, + "step": 7637 + }, + { + "epoch": 2.3184094703293368, + "grad_norm": 0.8786371350288391, + "learning_rate": 6.138503594208768e-05, + "loss": 1.3748, + "step": 7638 + }, + { + "epoch": 2.3187130065260284, + "grad_norm": 0.6298970580101013, + "learning_rate": 6.137997367621748e-05, + "loss": 1.0846, + "step": 7639 + }, + { + "epoch": 2.3190165427227196, + "grad_norm": 0.732848048210144, + "learning_rate": 6.137491141034727e-05, + "loss": 1.1163, + "step": 7640 + }, + { + "epoch": 2.3193200789194113, + "grad_norm": 0.6818196177482605, + "learning_rate": 6.136984914447708e-05, + "loss": 1.7087, + "step": 7641 + }, + { + "epoch": 2.3196236151161025, + "grad_norm": 0.628974974155426, + "learning_rate": 6.136478687860687e-05, + "loss": 0.9459, + "step": 7642 + }, + { + "epoch": 2.319927151312794, + "grad_norm": 0.7217102646827698, + "learning_rate": 6.135972461273667e-05, + "loss": 1.0762, + "step": 7643 + }, + { + "epoch": 2.3202306875094854, + "grad_norm": 0.7135050892829895, + "learning_rate": 6.135466234686646e-05, + "loss": 1.3588, + "step": 7644 + }, + { + "epoch": 2.320534223706177, + "grad_norm": 0.7265660166740417, + "learning_rate": 6.134960008099626e-05, + "loss": 1.601, + "step": 7645 + }, + { + "epoch": 2.3208377599028682, + "grad_norm": 0.7401967644691467, + "learning_rate": 6.134453781512605e-05, + "loss": 1.2171, + "step": 7646 + }, + { + "epoch": 2.32114129609956, + "grad_norm": 0.7953644394874573, + "learning_rate": 6.133947554925585e-05, + "loss": 1.5427, + "step": 7647 + }, + { + "epoch": 2.321444832296251, + "grad_norm": 0.86178058385849, + "learning_rate": 6.133441328338566e-05, + "loss": 0.9117, + "step": 7648 + }, + { + "epoch": 2.3217483684929427, + "grad_norm": 0.835649847984314, + "learning_rate": 6.132935101751545e-05, + "loss": 1.1636, + "step": 7649 + }, + { + "epoch": 2.3220519046896344, + "grad_norm": 0.866837739944458, + "learning_rate": 6.132428875164525e-05, + "loss": 1.4649, + "step": 7650 + }, + { + "epoch": 2.3223554408863256, + "grad_norm": 0.8551737070083618, + "learning_rate": 6.131922648577504e-05, + "loss": 1.4207, + "step": 7651 + }, + { + "epoch": 2.3226589770830173, + "grad_norm": 0.7347872257232666, + "learning_rate": 6.131416421990484e-05, + "loss": 1.118, + "step": 7652 + }, + { + "epoch": 2.3229625132797085, + "grad_norm": 0.682930052280426, + "learning_rate": 6.130910195403463e-05, + "loss": 1.6991, + "step": 7653 + }, + { + "epoch": 2.3232660494764, + "grad_norm": 0.8257922530174255, + "learning_rate": 6.130403968816442e-05, + "loss": 0.885, + "step": 7654 + }, + { + "epoch": 2.3235695856730914, + "grad_norm": 0.8472979068756104, + "learning_rate": 6.129897742229422e-05, + "loss": 0.9182, + "step": 7655 + }, + { + "epoch": 2.323873121869783, + "grad_norm": 0.8732231259346008, + "learning_rate": 6.129391515642401e-05, + "loss": 1.2871, + "step": 7656 + }, + { + "epoch": 2.3241766580664747, + "grad_norm": 0.7592588663101196, + "learning_rate": 6.128885289055382e-05, + "loss": 1.3911, + "step": 7657 + }, + { + "epoch": 2.324480194263166, + "grad_norm": 0.6937957406044006, + "learning_rate": 6.128379062468362e-05, + "loss": 0.6854, + "step": 7658 + }, + { + "epoch": 2.324783730459857, + "grad_norm": 0.5173711776733398, + "learning_rate": 6.127872835881341e-05, + "loss": 1.2109, + "step": 7659 + }, + { + "epoch": 2.3250872666565487, + "grad_norm": 0.711747407913208, + "learning_rate": 6.127366609294321e-05, + "loss": 1.1013, + "step": 7660 + }, + { + "epoch": 2.3253908028532404, + "grad_norm": 0.7678746581077576, + "learning_rate": 6.1268603827073e-05, + "loss": 1.7032, + "step": 7661 + }, + { + "epoch": 2.3256943390499316, + "grad_norm": 0.8963012099266052, + "learning_rate": 6.12635415612028e-05, + "loss": 1.585, + "step": 7662 + }, + { + "epoch": 2.3259978752466233, + "grad_norm": 0.730894148349762, + "learning_rate": 6.125847929533259e-05, + "loss": 0.9299, + "step": 7663 + }, + { + "epoch": 2.3263014114433145, + "grad_norm": 0.7582551836967468, + "learning_rate": 6.125341702946239e-05, + "loss": 1.0785, + "step": 7664 + }, + { + "epoch": 2.326604947640006, + "grad_norm": 0.7867629528045654, + "learning_rate": 6.124835476359218e-05, + "loss": 1.2304, + "step": 7665 + }, + { + "epoch": 2.3269084838366974, + "grad_norm": 0.9067895412445068, + "learning_rate": 6.124329249772198e-05, + "loss": 1.0494, + "step": 7666 + }, + { + "epoch": 2.327212020033389, + "grad_norm": 0.5455618500709534, + "learning_rate": 6.123823023185178e-05, + "loss": 0.5172, + "step": 7667 + }, + { + "epoch": 2.3275155562300807, + "grad_norm": 0.7320606112480164, + "learning_rate": 6.123316796598158e-05, + "loss": 1.1368, + "step": 7668 + }, + { + "epoch": 2.327819092426772, + "grad_norm": 0.8261992335319519, + "learning_rate": 6.122810570011137e-05, + "loss": 1.4457, + "step": 7669 + }, + { + "epoch": 2.3281226286234635, + "grad_norm": 0.7958891987800598, + "learning_rate": 6.122304343424117e-05, + "loss": 0.7067, + "step": 7670 + }, + { + "epoch": 2.3284261648201547, + "grad_norm": 0.8207949995994568, + "learning_rate": 6.121798116837096e-05, + "loss": 1.1606, + "step": 7671 + }, + { + "epoch": 2.3287297010168464, + "grad_norm": 0.8190235495567322, + "learning_rate": 6.121291890250076e-05, + "loss": 1.3621, + "step": 7672 + }, + { + "epoch": 2.3290332372135376, + "grad_norm": 0.7936198711395264, + "learning_rate": 6.120785663663055e-05, + "loss": 1.4129, + "step": 7673 + }, + { + "epoch": 2.3293367734102293, + "grad_norm": 0.7021920084953308, + "learning_rate": 6.120279437076035e-05, + "loss": 1.4672, + "step": 7674 + }, + { + "epoch": 2.3296403096069205, + "grad_norm": 0.8473578691482544, + "learning_rate": 6.119773210489014e-05, + "loss": 1.5384, + "step": 7675 + }, + { + "epoch": 2.329943845803612, + "grad_norm": 0.7380422949790955, + "learning_rate": 6.119266983901995e-05, + "loss": 1.4071, + "step": 7676 + }, + { + "epoch": 2.3302473820003033, + "grad_norm": 0.6773237586021423, + "learning_rate": 6.118760757314975e-05, + "loss": 1.0423, + "step": 7677 + }, + { + "epoch": 2.330550918196995, + "grad_norm": 0.8463999032974243, + "learning_rate": 6.118254530727954e-05, + "loss": 1.1341, + "step": 7678 + }, + { + "epoch": 2.3308544543936867, + "grad_norm": 0.8593174815177917, + "learning_rate": 6.117748304140934e-05, + "loss": 1.7154, + "step": 7679 + }, + { + "epoch": 2.331157990590378, + "grad_norm": 0.7568472623825073, + "learning_rate": 6.117242077553913e-05, + "loss": 0.9461, + "step": 7680 + }, + { + "epoch": 2.3314615267870695, + "grad_norm": 0.7861149907112122, + "learning_rate": 6.116735850966892e-05, + "loss": 1.3012, + "step": 7681 + }, + { + "epoch": 2.3317650629837607, + "grad_norm": 0.7344647645950317, + "learning_rate": 6.116229624379872e-05, + "loss": 1.5367, + "step": 7682 + }, + { + "epoch": 2.3320685991804524, + "grad_norm": 0.6908876895904541, + "learning_rate": 6.115723397792851e-05, + "loss": 0.8515, + "step": 7683 + }, + { + "epoch": 2.3323721353771436, + "grad_norm": 0.7117886543273926, + "learning_rate": 6.115217171205831e-05, + "loss": 0.9673, + "step": 7684 + }, + { + "epoch": 2.3326756715738353, + "grad_norm": 0.5932457447052002, + "learning_rate": 6.114710944618812e-05, + "loss": 1.1811, + "step": 7685 + }, + { + "epoch": 2.3329792077705265, + "grad_norm": 0.6621536612510681, + "learning_rate": 6.114204718031791e-05, + "loss": 1.4441, + "step": 7686 + }, + { + "epoch": 2.333282743967218, + "grad_norm": 0.7571014761924744, + "learning_rate": 6.113698491444772e-05, + "loss": 1.541, + "step": 7687 + }, + { + "epoch": 2.3335862801639093, + "grad_norm": 0.8175387978553772, + "learning_rate": 6.113192264857752e-05, + "loss": 1.1661, + "step": 7688 + }, + { + "epoch": 2.333889816360601, + "grad_norm": 0.7066230773925781, + "learning_rate": 6.112686038270731e-05, + "loss": 1.4773, + "step": 7689 + }, + { + "epoch": 2.3341933525572927, + "grad_norm": 0.9288036227226257, + "learning_rate": 6.11217981168371e-05, + "loss": 1.276, + "step": 7690 + }, + { + "epoch": 2.334496888753984, + "grad_norm": 0.845954179763794, + "learning_rate": 6.11167358509669e-05, + "loss": 1.4162, + "step": 7691 + }, + { + "epoch": 2.3348004249506755, + "grad_norm": 0.9958683252334595, + "learning_rate": 6.11116735850967e-05, + "loss": 1.3755, + "step": 7692 + }, + { + "epoch": 2.3351039611473667, + "grad_norm": 0.687445878982544, + "learning_rate": 6.110661131922649e-05, + "loss": 1.3896, + "step": 7693 + }, + { + "epoch": 2.3354074973440584, + "grad_norm": 0.651056170463562, + "learning_rate": 6.110154905335628e-05, + "loss": 1.793, + "step": 7694 + }, + { + "epoch": 2.3357110335407496, + "grad_norm": 0.7116444110870361, + "learning_rate": 6.109648678748608e-05, + "loss": 1.0677, + "step": 7695 + }, + { + "epoch": 2.3360145697374413, + "grad_norm": 0.7705413103103638, + "learning_rate": 6.109142452161589e-05, + "loss": 1.2177, + "step": 7696 + }, + { + "epoch": 2.3363181059341325, + "grad_norm": 0.7890743017196655, + "learning_rate": 6.108636225574568e-05, + "loss": 1.2361, + "step": 7697 + }, + { + "epoch": 2.336621642130824, + "grad_norm": 0.6681506633758545, + "learning_rate": 6.108129998987548e-05, + "loss": 1.5302, + "step": 7698 + }, + { + "epoch": 2.3369251783275153, + "grad_norm": 0.9150996804237366, + "learning_rate": 6.107623772400527e-05, + "loss": 1.3499, + "step": 7699 + }, + { + "epoch": 2.337228714524207, + "grad_norm": 0.9253705739974976, + "learning_rate": 6.107117545813507e-05, + "loss": 0.9038, + "step": 7700 + }, + { + "epoch": 2.3375322507208987, + "grad_norm": 1.0936262607574463, + "learning_rate": 6.106611319226486e-05, + "loss": 0.917, + "step": 7701 + }, + { + "epoch": 2.33783578691759, + "grad_norm": 0.8515232801437378, + "learning_rate": 6.106105092639466e-05, + "loss": 1.1466, + "step": 7702 + }, + { + "epoch": 2.3381393231142815, + "grad_norm": 0.8388434052467346, + "learning_rate": 6.105598866052445e-05, + "loss": 1.657, + "step": 7703 + }, + { + "epoch": 2.3384428593109727, + "grad_norm": 0.8349151015281677, + "learning_rate": 6.105092639465425e-05, + "loss": 0.7792, + "step": 7704 + }, + { + "epoch": 2.3387463955076644, + "grad_norm": 0.7993485331535339, + "learning_rate": 6.104586412878404e-05, + "loss": 1.4357, + "step": 7705 + }, + { + "epoch": 2.3390499317043556, + "grad_norm": 0.6768009662628174, + "learning_rate": 6.104080186291385e-05, + "loss": 1.7998, + "step": 7706 + }, + { + "epoch": 2.3393534679010473, + "grad_norm": 0.9533233046531677, + "learning_rate": 6.103573959704364e-05, + "loss": 0.5943, + "step": 7707 + }, + { + "epoch": 2.3396570040977385, + "grad_norm": 0.787339448928833, + "learning_rate": 6.103067733117344e-05, + "loss": 1.171, + "step": 7708 + }, + { + "epoch": 2.33996054029443, + "grad_norm": 0.6218050122261047, + "learning_rate": 6.102561506530323e-05, + "loss": 1.3632, + "step": 7709 + }, + { + "epoch": 2.3402640764911213, + "grad_norm": 0.7959072589874268, + "learning_rate": 6.102055279943303e-05, + "loss": 1.4095, + "step": 7710 + }, + { + "epoch": 2.340567612687813, + "grad_norm": 0.6838813424110413, + "learning_rate": 6.101549053356282e-05, + "loss": 1.4838, + "step": 7711 + }, + { + "epoch": 2.3408711488845046, + "grad_norm": 0.5955168604850769, + "learning_rate": 6.101042826769262e-05, + "loss": 1.0594, + "step": 7712 + }, + { + "epoch": 2.341174685081196, + "grad_norm": 0.7063366174697876, + "learning_rate": 6.100536600182242e-05, + "loss": 1.2897, + "step": 7713 + }, + { + "epoch": 2.3414782212778875, + "grad_norm": 0.707691490650177, + "learning_rate": 6.1000303735952214e-05, + "loss": 1.3633, + "step": 7714 + }, + { + "epoch": 2.3417817574745787, + "grad_norm": 0.8361237645149231, + "learning_rate": 6.099524147008201e-05, + "loss": 1.0799, + "step": 7715 + }, + { + "epoch": 2.3420852936712704, + "grad_norm": 0.8068976402282715, + "learning_rate": 6.0990179204211804e-05, + "loss": 1.0683, + "step": 7716 + }, + { + "epoch": 2.3423888298679616, + "grad_norm": 0.6763870716094971, + "learning_rate": 6.0985116938341605e-05, + "loss": 1.7407, + "step": 7717 + }, + { + "epoch": 2.3426923660646533, + "grad_norm": 0.9038770198822021, + "learning_rate": 6.09800546724714e-05, + "loss": 1.0923, + "step": 7718 + }, + { + "epoch": 2.342995902261345, + "grad_norm": 0.7028255462646484, + "learning_rate": 6.0974992406601195e-05, + "loss": 1.1877, + "step": 7719 + }, + { + "epoch": 2.343299438458036, + "grad_norm": 0.8663949966430664, + "learning_rate": 6.096993014073099e-05, + "loss": 1.6386, + "step": 7720 + }, + { + "epoch": 2.3436029746547273, + "grad_norm": 1.0049313306808472, + "learning_rate": 6.0964867874860784e-05, + "loss": 1.1968, + "step": 7721 + }, + { + "epoch": 2.343906510851419, + "grad_norm": 0.9146410226821899, + "learning_rate": 6.0959805608990586e-05, + "loss": 1.3352, + "step": 7722 + }, + { + "epoch": 2.3442100470481106, + "grad_norm": 0.913068413734436, + "learning_rate": 6.095474334312038e-05, + "loss": 1.2722, + "step": 7723 + }, + { + "epoch": 2.344513583244802, + "grad_norm": 0.7167338132858276, + "learning_rate": 6.0949681077250176e-05, + "loss": 1.0658, + "step": 7724 + }, + { + "epoch": 2.3448171194414935, + "grad_norm": 0.557360827922821, + "learning_rate": 6.094461881137997e-05, + "loss": 1.1557, + "step": 7725 + }, + { + "epoch": 2.3451206556381847, + "grad_norm": 0.6517195701599121, + "learning_rate": 6.0939556545509765e-05, + "loss": 1.5971, + "step": 7726 + }, + { + "epoch": 2.3454241918348764, + "grad_norm": 0.8067708611488342, + "learning_rate": 6.093449427963957e-05, + "loss": 0.9163, + "step": 7727 + }, + { + "epoch": 2.3457277280315676, + "grad_norm": 0.7419809699058533, + "learning_rate": 6.092943201376936e-05, + "loss": 1.6252, + "step": 7728 + }, + { + "epoch": 2.3460312642282592, + "grad_norm": 0.6952763795852661, + "learning_rate": 6.0924369747899156e-05, + "loss": 1.4066, + "step": 7729 + }, + { + "epoch": 2.346334800424951, + "grad_norm": 0.9591590166091919, + "learning_rate": 6.0919307482028965e-05, + "loss": 1.0623, + "step": 7730 + }, + { + "epoch": 2.346638336621642, + "grad_norm": 0.563372790813446, + "learning_rate": 6.091424521615876e-05, + "loss": 1.1935, + "step": 7731 + }, + { + "epoch": 2.3469418728183338, + "grad_norm": 0.6305344104766846, + "learning_rate": 6.0909182950288554e-05, + "loss": 1.5607, + "step": 7732 + }, + { + "epoch": 2.347245409015025, + "grad_norm": 0.6717035174369812, + "learning_rate": 6.0904120684418356e-05, + "loss": 1.2191, + "step": 7733 + }, + { + "epoch": 2.3475489452117166, + "grad_norm": 0.7762973308563232, + "learning_rate": 6.089905841854815e-05, + "loss": 1.4233, + "step": 7734 + }, + { + "epoch": 2.347852481408408, + "grad_norm": 0.8079813122749329, + "learning_rate": 6.0893996152677945e-05, + "loss": 1.3683, + "step": 7735 + }, + { + "epoch": 2.3481560176050995, + "grad_norm": 0.5496336221694946, + "learning_rate": 6.088893388680774e-05, + "loss": 0.9201, + "step": 7736 + }, + { + "epoch": 2.3484595538017907, + "grad_norm": 0.863309919834137, + "learning_rate": 6.0883871620937535e-05, + "loss": 0.6286, + "step": 7737 + }, + { + "epoch": 2.3487630899984824, + "grad_norm": 0.8281985521316528, + "learning_rate": 6.0878809355067337e-05, + "loss": 1.04, + "step": 7738 + }, + { + "epoch": 2.3490666261951736, + "grad_norm": 0.7266848087310791, + "learning_rate": 6.087374708919713e-05, + "loss": 1.3901, + "step": 7739 + }, + { + "epoch": 2.3493701623918652, + "grad_norm": 0.9094659686088562, + "learning_rate": 6.0868684823326926e-05, + "loss": 1.1477, + "step": 7740 + }, + { + "epoch": 2.349673698588557, + "grad_norm": 0.7695726156234741, + "learning_rate": 6.086362255745672e-05, + "loss": 1.4168, + "step": 7741 + }, + { + "epoch": 2.349977234785248, + "grad_norm": 0.8188008666038513, + "learning_rate": 6.085856029158652e-05, + "loss": 1.2717, + "step": 7742 + }, + { + "epoch": 2.3502807709819398, + "grad_norm": 0.8061943650245667, + "learning_rate": 6.085349802571632e-05, + "loss": 1.5625, + "step": 7743 + }, + { + "epoch": 2.350584307178631, + "grad_norm": 0.7862029671669006, + "learning_rate": 6.084843575984611e-05, + "loss": 1.5496, + "step": 7744 + }, + { + "epoch": 2.3508878433753226, + "grad_norm": 0.6617637872695923, + "learning_rate": 6.084337349397591e-05, + "loss": 1.633, + "step": 7745 + }, + { + "epoch": 2.351191379572014, + "grad_norm": 0.9673377871513367, + "learning_rate": 6.08383112281057e-05, + "loss": 0.8355, + "step": 7746 + }, + { + "epoch": 2.3514949157687055, + "grad_norm": 0.7346013188362122, + "learning_rate": 6.08332489622355e-05, + "loss": 1.3608, + "step": 7747 + }, + { + "epoch": 2.3517984519653967, + "grad_norm": 0.8306134343147278, + "learning_rate": 6.08281866963653e-05, + "loss": 1.3994, + "step": 7748 + }, + { + "epoch": 2.3521019881620884, + "grad_norm": 0.7612175941467285, + "learning_rate": 6.082312443049509e-05, + "loss": 1.0089, + "step": 7749 + }, + { + "epoch": 2.3524055243587796, + "grad_norm": 0.8626055121421814, + "learning_rate": 6.081806216462489e-05, + "loss": 1.2201, + "step": 7750 + }, + { + "epoch": 2.3527090605554712, + "grad_norm": 0.613272488117218, + "learning_rate": 6.081299989875468e-05, + "loss": 1.0982, + "step": 7751 + }, + { + "epoch": 2.353012596752163, + "grad_norm": 0.6248640418052673, + "learning_rate": 6.0807937632884484e-05, + "loss": 1.4992, + "step": 7752 + }, + { + "epoch": 2.353316132948854, + "grad_norm": 0.7593119740486145, + "learning_rate": 6.080287536701428e-05, + "loss": 1.3339, + "step": 7753 + }, + { + "epoch": 2.3536196691455458, + "grad_norm": 0.6562939286231995, + "learning_rate": 6.0797813101144074e-05, + "loss": 1.4767, + "step": 7754 + }, + { + "epoch": 2.353923205342237, + "grad_norm": 0.6290830969810486, + "learning_rate": 6.079275083527387e-05, + "loss": 1.6008, + "step": 7755 + }, + { + "epoch": 2.3542267415389286, + "grad_norm": 0.6489423513412476, + "learning_rate": 6.078768856940367e-05, + "loss": 1.5682, + "step": 7756 + }, + { + "epoch": 2.35453027773562, + "grad_norm": 0.8090351819992065, + "learning_rate": 6.0782626303533465e-05, + "loss": 1.2493, + "step": 7757 + }, + { + "epoch": 2.3548338139323115, + "grad_norm": 0.7439088821411133, + "learning_rate": 6.077756403766326e-05, + "loss": 1.0664, + "step": 7758 + }, + { + "epoch": 2.3551373501290027, + "grad_norm": 0.8158544898033142, + "learning_rate": 6.0772501771793054e-05, + "loss": 0.9277, + "step": 7759 + }, + { + "epoch": 2.3554408863256944, + "grad_norm": 0.6496466398239136, + "learning_rate": 6.076743950592285e-05, + "loss": 1.5302, + "step": 7760 + }, + { + "epoch": 2.3557444225223856, + "grad_norm": 0.7689223885536194, + "learning_rate": 6.076237724005265e-05, + "loss": 1.0756, + "step": 7761 + }, + { + "epoch": 2.3560479587190772, + "grad_norm": 0.8746340274810791, + "learning_rate": 6.0757314974182445e-05, + "loss": 1.1543, + "step": 7762 + }, + { + "epoch": 2.356351494915769, + "grad_norm": 0.6918237209320068, + "learning_rate": 6.075225270831224e-05, + "loss": 1.3064, + "step": 7763 + }, + { + "epoch": 2.35665503111246, + "grad_norm": 0.6862085461616516, + "learning_rate": 6.0747190442442035e-05, + "loss": 1.6043, + "step": 7764 + }, + { + "epoch": 2.3569585673091518, + "grad_norm": 0.8114455342292786, + "learning_rate": 6.074212817657183e-05, + "loss": 1.5732, + "step": 7765 + }, + { + "epoch": 2.357262103505843, + "grad_norm": 1.0163137912750244, + "learning_rate": 6.073706591070163e-05, + "loss": 1.098, + "step": 7766 + }, + { + "epoch": 2.3575656397025346, + "grad_norm": 0.6978328824043274, + "learning_rate": 6.0732003644831426e-05, + "loss": 1.2409, + "step": 7767 + }, + { + "epoch": 2.357869175899226, + "grad_norm": 0.7484824061393738, + "learning_rate": 6.072694137896122e-05, + "loss": 1.3748, + "step": 7768 + }, + { + "epoch": 2.3581727120959175, + "grad_norm": 0.8369539380073547, + "learning_rate": 6.0721879113091016e-05, + "loss": 1.4558, + "step": 7769 + }, + { + "epoch": 2.3584762482926087, + "grad_norm": 0.7077987194061279, + "learning_rate": 6.071681684722082e-05, + "loss": 1.0913, + "step": 7770 + }, + { + "epoch": 2.3587797844893004, + "grad_norm": 0.8944743275642395, + "learning_rate": 6.071175458135061e-05, + "loss": 1.2509, + "step": 7771 + }, + { + "epoch": 2.3590833206859916, + "grad_norm": 0.5822862982749939, + "learning_rate": 6.070669231548041e-05, + "loss": 0.7585, + "step": 7772 + }, + { + "epoch": 2.3593868568826832, + "grad_norm": 0.7083290815353394, + "learning_rate": 6.07016300496102e-05, + "loss": 1.6672, + "step": 7773 + }, + { + "epoch": 2.359690393079375, + "grad_norm": 0.7272571921348572, + "learning_rate": 6.069656778374001e-05, + "loss": 1.4634, + "step": 7774 + }, + { + "epoch": 2.359993929276066, + "grad_norm": 0.7781559824943542, + "learning_rate": 6.0691505517869805e-05, + "loss": 0.7688, + "step": 7775 + }, + { + "epoch": 2.3602974654727578, + "grad_norm": 0.7398669719696045, + "learning_rate": 6.06864432519996e-05, + "loss": 1.5047, + "step": 7776 + }, + { + "epoch": 2.360601001669449, + "grad_norm": 0.5970739722251892, + "learning_rate": 6.06813809861294e-05, + "loss": 1.6261, + "step": 7777 + }, + { + "epoch": 2.3609045378661406, + "grad_norm": 0.8803917169570923, + "learning_rate": 6.0676318720259196e-05, + "loss": 1.3531, + "step": 7778 + }, + { + "epoch": 2.361208074062832, + "grad_norm": 0.6425435543060303, + "learning_rate": 6.067125645438899e-05, + "loss": 1.2065, + "step": 7779 + }, + { + "epoch": 2.3615116102595235, + "grad_norm": 0.6474995017051697, + "learning_rate": 6.0666194188518786e-05, + "loss": 1.7477, + "step": 7780 + }, + { + "epoch": 2.361815146456215, + "grad_norm": 0.8857691884040833, + "learning_rate": 6.066113192264859e-05, + "loss": 1.5678, + "step": 7781 + }, + { + "epoch": 2.3621186826529064, + "grad_norm": 1.0378564596176147, + "learning_rate": 6.065606965677838e-05, + "loss": 1.3457, + "step": 7782 + }, + { + "epoch": 2.3624222188495976, + "grad_norm": 0.6623472571372986, + "learning_rate": 6.065100739090818e-05, + "loss": 1.3708, + "step": 7783 + }, + { + "epoch": 2.3627257550462892, + "grad_norm": 0.7117280960083008, + "learning_rate": 6.064594512503797e-05, + "loss": 1.5782, + "step": 7784 + }, + { + "epoch": 2.363029291242981, + "grad_norm": 0.6938657164573669, + "learning_rate": 6.0640882859167766e-05, + "loss": 0.8125, + "step": 7785 + }, + { + "epoch": 2.363332827439672, + "grad_norm": 0.659076988697052, + "learning_rate": 6.063582059329757e-05, + "loss": 1.483, + "step": 7786 + }, + { + "epoch": 2.3636363636363638, + "grad_norm": 0.7740662693977356, + "learning_rate": 6.063075832742736e-05, + "loss": 1.4259, + "step": 7787 + }, + { + "epoch": 2.363939899833055, + "grad_norm": 0.7431475520133972, + "learning_rate": 6.062569606155716e-05, + "loss": 1.6874, + "step": 7788 + }, + { + "epoch": 2.3642434360297466, + "grad_norm": 0.8177775740623474, + "learning_rate": 6.062063379568695e-05, + "loss": 1.2511, + "step": 7789 + }, + { + "epoch": 2.364546972226438, + "grad_norm": 0.7437436580657959, + "learning_rate": 6.061557152981675e-05, + "loss": 1.3728, + "step": 7790 + }, + { + "epoch": 2.3648505084231295, + "grad_norm": 0.7326698899269104, + "learning_rate": 6.061050926394655e-05, + "loss": 1.3606, + "step": 7791 + }, + { + "epoch": 2.365154044619821, + "grad_norm": 0.7051557302474976, + "learning_rate": 6.0605446998076344e-05, + "loss": 1.7839, + "step": 7792 + }, + { + "epoch": 2.3654575808165124, + "grad_norm": 0.9070965051651001, + "learning_rate": 6.060038473220614e-05, + "loss": 1.1911, + "step": 7793 + }, + { + "epoch": 2.365761117013204, + "grad_norm": 0.879751443862915, + "learning_rate": 6.059532246633593e-05, + "loss": 1.2265, + "step": 7794 + }, + { + "epoch": 2.3660646532098952, + "grad_norm": 0.7137269973754883, + "learning_rate": 6.0590260200465735e-05, + "loss": 1.1799, + "step": 7795 + }, + { + "epoch": 2.366368189406587, + "grad_norm": 0.9179060459136963, + "learning_rate": 6.058519793459553e-05, + "loss": 1.3917, + "step": 7796 + }, + { + "epoch": 2.366671725603278, + "grad_norm": 0.8052315711975098, + "learning_rate": 6.0580135668725324e-05, + "loss": 0.8737, + "step": 7797 + }, + { + "epoch": 2.3669752617999698, + "grad_norm": 0.7714252471923828, + "learning_rate": 6.057507340285512e-05, + "loss": 1.3054, + "step": 7798 + }, + { + "epoch": 2.367278797996661, + "grad_norm": 0.8135915994644165, + "learning_rate": 6.0570011136984914e-05, + "loss": 1.344, + "step": 7799 + }, + { + "epoch": 2.3675823341933526, + "grad_norm": 0.7176734209060669, + "learning_rate": 6.0564948871114715e-05, + "loss": 1.1194, + "step": 7800 + }, + { + "epoch": 2.367885870390044, + "grad_norm": 0.7938108444213867, + "learning_rate": 6.055988660524451e-05, + "loss": 1.0659, + "step": 7801 + }, + { + "epoch": 2.3681894065867355, + "grad_norm": 0.6610084176063538, + "learning_rate": 6.0554824339374305e-05, + "loss": 1.6436, + "step": 7802 + }, + { + "epoch": 2.368492942783427, + "grad_norm": 0.818847119808197, + "learning_rate": 6.05497620735041e-05, + "loss": 1.1547, + "step": 7803 + }, + { + "epoch": 2.3687964789801184, + "grad_norm": 1.013972282409668, + "learning_rate": 6.0544699807633895e-05, + "loss": 1.2886, + "step": 7804 + }, + { + "epoch": 2.36910001517681, + "grad_norm": 0.7468999028205872, + "learning_rate": 6.0539637541763696e-05, + "loss": 1.0146, + "step": 7805 + }, + { + "epoch": 2.3694035513735012, + "grad_norm": 0.692268967628479, + "learning_rate": 6.053457527589349e-05, + "loss": 1.4758, + "step": 7806 + }, + { + "epoch": 2.369707087570193, + "grad_norm": 0.6096368432044983, + "learning_rate": 6.0529513010023286e-05, + "loss": 1.2638, + "step": 7807 + }, + { + "epoch": 2.370010623766884, + "grad_norm": 0.7329953908920288, + "learning_rate": 6.052445074415308e-05, + "loss": 1.4867, + "step": 7808 + }, + { + "epoch": 2.3703141599635758, + "grad_norm": 0.7470860481262207, + "learning_rate": 6.051938847828288e-05, + "loss": 1.3024, + "step": 7809 + }, + { + "epoch": 2.370617696160267, + "grad_norm": 0.8400612473487854, + "learning_rate": 6.051432621241268e-05, + "loss": 1.5507, + "step": 7810 + }, + { + "epoch": 2.3709212323569586, + "grad_norm": 0.7138293385505676, + "learning_rate": 6.050926394654247e-05, + "loss": 1.4085, + "step": 7811 + }, + { + "epoch": 2.37122476855365, + "grad_norm": 1.1816941499710083, + "learning_rate": 6.0504201680672267e-05, + "loss": 0.9353, + "step": 7812 + }, + { + "epoch": 2.3715283047503415, + "grad_norm": 0.9305357933044434, + "learning_rate": 6.049913941480206e-05, + "loss": 1.285, + "step": 7813 + }, + { + "epoch": 2.371831840947033, + "grad_norm": 0.825706422328949, + "learning_rate": 6.049407714893186e-05, + "loss": 1.3601, + "step": 7814 + }, + { + "epoch": 2.3721353771437244, + "grad_norm": 0.7762649059295654, + "learning_rate": 6.048901488306166e-05, + "loss": 1.3771, + "step": 7815 + }, + { + "epoch": 2.372438913340416, + "grad_norm": 1.1662933826446533, + "learning_rate": 6.048395261719145e-05, + "loss": 1.0773, + "step": 7816 + }, + { + "epoch": 2.3727424495371072, + "grad_norm": 0.6628431081771851, + "learning_rate": 6.047889035132125e-05, + "loss": 1.0688, + "step": 7817 + }, + { + "epoch": 2.373045985733799, + "grad_norm": 0.8049399256706238, + "learning_rate": 6.047382808545104e-05, + "loss": 1.3595, + "step": 7818 + }, + { + "epoch": 2.37334952193049, + "grad_norm": 0.7456562519073486, + "learning_rate": 6.046876581958085e-05, + "loss": 1.7107, + "step": 7819 + }, + { + "epoch": 2.3736530581271817, + "grad_norm": 0.8927516937255859, + "learning_rate": 6.046370355371065e-05, + "loss": 1.4058, + "step": 7820 + }, + { + "epoch": 2.373956594323873, + "grad_norm": 0.8116360306739807, + "learning_rate": 6.045864128784045e-05, + "loss": 1.7016, + "step": 7821 + }, + { + "epoch": 2.3742601305205646, + "grad_norm": 0.6701048612594604, + "learning_rate": 6.045357902197024e-05, + "loss": 1.7531, + "step": 7822 + }, + { + "epoch": 2.374563666717256, + "grad_norm": 0.7412038445472717, + "learning_rate": 6.0448516756100036e-05, + "loss": 1.5549, + "step": 7823 + }, + { + "epoch": 2.3748672029139475, + "grad_norm": 0.7359911203384399, + "learning_rate": 6.044345449022983e-05, + "loss": 1.3267, + "step": 7824 + }, + { + "epoch": 2.375170739110639, + "grad_norm": 0.6358724236488342, + "learning_rate": 6.043839222435963e-05, + "loss": 1.5852, + "step": 7825 + }, + { + "epoch": 2.3754742753073304, + "grad_norm": 0.715941309928894, + "learning_rate": 6.043332995848943e-05, + "loss": 0.7863, + "step": 7826 + }, + { + "epoch": 2.375777811504022, + "grad_norm": 0.5403160452842712, + "learning_rate": 6.042826769261922e-05, + "loss": 1.6579, + "step": 7827 + }, + { + "epoch": 2.376081347700713, + "grad_norm": 0.5860801935195923, + "learning_rate": 6.042320542674902e-05, + "loss": 0.6214, + "step": 7828 + }, + { + "epoch": 2.376384883897405, + "grad_norm": 0.7888510227203369, + "learning_rate": 6.041814316087881e-05, + "loss": 1.262, + "step": 7829 + }, + { + "epoch": 2.376688420094096, + "grad_norm": 0.870266318321228, + "learning_rate": 6.0413080895008613e-05, + "loss": 1.3956, + "step": 7830 + }, + { + "epoch": 2.3769919562907877, + "grad_norm": 0.6735515594482422, + "learning_rate": 6.040801862913841e-05, + "loss": 0.8394, + "step": 7831 + }, + { + "epoch": 2.377295492487479, + "grad_norm": 0.7404083609580994, + "learning_rate": 6.04029563632682e-05, + "loss": 1.3891, + "step": 7832 + }, + { + "epoch": 2.3775990286841706, + "grad_norm": 0.6634083986282349, + "learning_rate": 6.0397894097398e-05, + "loss": 1.4729, + "step": 7833 + }, + { + "epoch": 2.377902564880862, + "grad_norm": 1.2230887413024902, + "learning_rate": 6.03928318315278e-05, + "loss": 1.3533, + "step": 7834 + }, + { + "epoch": 2.3782061010775535, + "grad_norm": 0.723315417766571, + "learning_rate": 6.0387769565657594e-05, + "loss": 1.4642, + "step": 7835 + }, + { + "epoch": 2.378509637274245, + "grad_norm": 0.8489314913749695, + "learning_rate": 6.038270729978739e-05, + "loss": 1.7669, + "step": 7836 + }, + { + "epoch": 2.3788131734709363, + "grad_norm": 0.8420275449752808, + "learning_rate": 6.0377645033917184e-05, + "loss": 1.3251, + "step": 7837 + }, + { + "epoch": 2.379116709667628, + "grad_norm": 0.663590669631958, + "learning_rate": 6.037258276804698e-05, + "loss": 1.2515, + "step": 7838 + }, + { + "epoch": 2.379420245864319, + "grad_norm": 0.7389885187149048, + "learning_rate": 6.036752050217678e-05, + "loss": 1.0688, + "step": 7839 + }, + { + "epoch": 2.379723782061011, + "grad_norm": 0.7165906429290771, + "learning_rate": 6.0362458236306575e-05, + "loss": 0.8493, + "step": 7840 + }, + { + "epoch": 2.380027318257702, + "grad_norm": 0.7658936977386475, + "learning_rate": 6.035739597043637e-05, + "loss": 1.2728, + "step": 7841 + }, + { + "epoch": 2.3803308544543937, + "grad_norm": 0.9110761880874634, + "learning_rate": 6.0352333704566165e-05, + "loss": 1.1135, + "step": 7842 + }, + { + "epoch": 2.380634390651085, + "grad_norm": 0.7847999334335327, + "learning_rate": 6.034727143869596e-05, + "loss": 1.5611, + "step": 7843 + }, + { + "epoch": 2.3809379268477766, + "grad_norm": 0.6586335301399231, + "learning_rate": 6.034220917282576e-05, + "loss": 1.5972, + "step": 7844 + }, + { + "epoch": 2.381241463044468, + "grad_norm": 0.7695474624633789, + "learning_rate": 6.0337146906955556e-05, + "loss": 1.0394, + "step": 7845 + }, + { + "epoch": 2.3815449992411595, + "grad_norm": 0.7837185859680176, + "learning_rate": 6.033208464108535e-05, + "loss": 1.1378, + "step": 7846 + }, + { + "epoch": 2.381848535437851, + "grad_norm": 0.9874072670936584, + "learning_rate": 6.0327022375215145e-05, + "loss": 1.0721, + "step": 7847 + }, + { + "epoch": 2.3821520716345423, + "grad_norm": 0.6492806673049927, + "learning_rate": 6.032196010934495e-05, + "loss": 1.3812, + "step": 7848 + }, + { + "epoch": 2.382455607831234, + "grad_norm": 0.8687458634376526, + "learning_rate": 6.031689784347474e-05, + "loss": 1.4708, + "step": 7849 + }, + { + "epoch": 2.382759144027925, + "grad_norm": 0.9347490072250366, + "learning_rate": 6.0311835577604536e-05, + "loss": 1.512, + "step": 7850 + }, + { + "epoch": 2.383062680224617, + "grad_norm": 0.7700411081314087, + "learning_rate": 6.030677331173433e-05, + "loss": 1.1627, + "step": 7851 + }, + { + "epoch": 2.383366216421308, + "grad_norm": 0.4575173854827881, + "learning_rate": 6.0301711045864126e-05, + "loss": 0.7804, + "step": 7852 + }, + { + "epoch": 2.3836697526179997, + "grad_norm": 0.7945066690444946, + "learning_rate": 6.029664877999393e-05, + "loss": 1.2632, + "step": 7853 + }, + { + "epoch": 2.3839732888146914, + "grad_norm": 0.8826307058334351, + "learning_rate": 6.029158651412372e-05, + "loss": 1.4142, + "step": 7854 + }, + { + "epoch": 2.3842768250113826, + "grad_norm": 0.9518096446990967, + "learning_rate": 6.028652424825352e-05, + "loss": 1.0018, + "step": 7855 + }, + { + "epoch": 2.384580361208074, + "grad_norm": 0.742034912109375, + "learning_rate": 6.028146198238331e-05, + "loss": 1.5479, + "step": 7856 + }, + { + "epoch": 2.3848838974047655, + "grad_norm": 0.7931845784187317, + "learning_rate": 6.027639971651311e-05, + "loss": 1.3477, + "step": 7857 + }, + { + "epoch": 2.385187433601457, + "grad_norm": 0.8038697242736816, + "learning_rate": 6.027133745064291e-05, + "loss": 1.0658, + "step": 7858 + }, + { + "epoch": 2.3854909697981483, + "grad_norm": 0.7707939743995667, + "learning_rate": 6.02662751847727e-05, + "loss": 1.3347, + "step": 7859 + }, + { + "epoch": 2.38579450599484, + "grad_norm": 0.6021765470504761, + "learning_rate": 6.02612129189025e-05, + "loss": 1.4294, + "step": 7860 + }, + { + "epoch": 2.386098042191531, + "grad_norm": 0.578547477722168, + "learning_rate": 6.025615065303229e-05, + "loss": 1.4683, + "step": 7861 + }, + { + "epoch": 2.386401578388223, + "grad_norm": 0.8599765300750732, + "learning_rate": 6.0251088387162094e-05, + "loss": 1.0985, + "step": 7862 + }, + { + "epoch": 2.386705114584914, + "grad_norm": 0.7970162630081177, + "learning_rate": 6.0246026121291896e-05, + "loss": 1.3855, + "step": 7863 + }, + { + "epoch": 2.3870086507816057, + "grad_norm": 1.0750303268432617, + "learning_rate": 6.02409638554217e-05, + "loss": 1.6049, + "step": 7864 + }, + { + "epoch": 2.3873121869782974, + "grad_norm": 0.8112868070602417, + "learning_rate": 6.023590158955149e-05, + "loss": 1.0065, + "step": 7865 + }, + { + "epoch": 2.3876157231749886, + "grad_norm": 0.7846496105194092, + "learning_rate": 6.023083932368129e-05, + "loss": 1.4306, + "step": 7866 + }, + { + "epoch": 2.3879192593716803, + "grad_norm": 0.7304663062095642, + "learning_rate": 6.022577705781108e-05, + "loss": 1.6198, + "step": 7867 + }, + { + "epoch": 2.3882227955683715, + "grad_norm": 0.8028169870376587, + "learning_rate": 6.022071479194088e-05, + "loss": 1.4573, + "step": 7868 + }, + { + "epoch": 2.388526331765063, + "grad_norm": 0.8368347883224487, + "learning_rate": 6.021565252607068e-05, + "loss": 1.1526, + "step": 7869 + }, + { + "epoch": 2.3888298679617543, + "grad_norm": 0.8738002181053162, + "learning_rate": 6.021059026020047e-05, + "loss": 1.2524, + "step": 7870 + }, + { + "epoch": 2.389133404158446, + "grad_norm": 0.7346928715705872, + "learning_rate": 6.020552799433027e-05, + "loss": 1.4939, + "step": 7871 + }, + { + "epoch": 2.389436940355137, + "grad_norm": 0.7887352108955383, + "learning_rate": 6.020046572846006e-05, + "loss": 1.3927, + "step": 7872 + }, + { + "epoch": 2.389740476551829, + "grad_norm": 1.0919454097747803, + "learning_rate": 6.0195403462589864e-05, + "loss": 1.0354, + "step": 7873 + }, + { + "epoch": 2.39004401274852, + "grad_norm": 0.819840133190155, + "learning_rate": 6.019034119671966e-05, + "loss": 1.7027, + "step": 7874 + }, + { + "epoch": 2.3903475489452117, + "grad_norm": 0.7620285749435425, + "learning_rate": 6.0185278930849454e-05, + "loss": 1.0801, + "step": 7875 + }, + { + "epoch": 2.3906510851419034, + "grad_norm": 0.7657316327095032, + "learning_rate": 6.018021666497925e-05, + "loss": 1.5568, + "step": 7876 + }, + { + "epoch": 2.3909546213385946, + "grad_norm": 0.6373672485351562, + "learning_rate": 6.017515439910904e-05, + "loss": 1.6544, + "step": 7877 + }, + { + "epoch": 2.3912581575352863, + "grad_norm": 0.7929403781890869, + "learning_rate": 6.0170092133238845e-05, + "loss": 1.468, + "step": 7878 + }, + { + "epoch": 2.3915616937319775, + "grad_norm": 0.9188044667243958, + "learning_rate": 6.016502986736864e-05, + "loss": 1.3987, + "step": 7879 + }, + { + "epoch": 2.391865229928669, + "grad_norm": 0.727801501750946, + "learning_rate": 6.0159967601498435e-05, + "loss": 1.5465, + "step": 7880 + }, + { + "epoch": 2.3921687661253603, + "grad_norm": 0.7766900658607483, + "learning_rate": 6.015490533562823e-05, + "loss": 1.3968, + "step": 7881 + }, + { + "epoch": 2.392472302322052, + "grad_norm": 0.7090250849723816, + "learning_rate": 6.0149843069758024e-05, + "loss": 1.4824, + "step": 7882 + }, + { + "epoch": 2.392775838518743, + "grad_norm": 0.8688755035400391, + "learning_rate": 6.0144780803887826e-05, + "loss": 1.4126, + "step": 7883 + }, + { + "epoch": 2.393079374715435, + "grad_norm": 0.6128059029579163, + "learning_rate": 6.013971853801762e-05, + "loss": 1.6804, + "step": 7884 + }, + { + "epoch": 2.393382910912126, + "grad_norm": 0.6764035820960999, + "learning_rate": 6.0134656272147415e-05, + "loss": 1.1744, + "step": 7885 + }, + { + "epoch": 2.3936864471088177, + "grad_norm": 0.8046677708625793, + "learning_rate": 6.012959400627721e-05, + "loss": 1.3921, + "step": 7886 + }, + { + "epoch": 2.3939899833055094, + "grad_norm": 0.7015335559844971, + "learning_rate": 6.012453174040701e-05, + "loss": 1.2581, + "step": 7887 + }, + { + "epoch": 2.3942935195022006, + "grad_norm": 0.7972201704978943, + "learning_rate": 6.0119469474536806e-05, + "loss": 0.6561, + "step": 7888 + }, + { + "epoch": 2.3945970556988923, + "grad_norm": 0.6351714134216309, + "learning_rate": 6.01144072086666e-05, + "loss": 1.1843, + "step": 7889 + }, + { + "epoch": 2.3949005918955835, + "grad_norm": 0.9243821501731873, + "learning_rate": 6.0109344942796396e-05, + "loss": 1.3206, + "step": 7890 + }, + { + "epoch": 2.395204128092275, + "grad_norm": 0.7279089689254761, + "learning_rate": 6.010428267692619e-05, + "loss": 1.625, + "step": 7891 + }, + { + "epoch": 2.3955076642889663, + "grad_norm": 0.7650377154350281, + "learning_rate": 6.009922041105599e-05, + "loss": 1.6475, + "step": 7892 + }, + { + "epoch": 2.395811200485658, + "grad_norm": 0.7732207775115967, + "learning_rate": 6.009415814518579e-05, + "loss": 1.426, + "step": 7893 + }, + { + "epoch": 2.396114736682349, + "grad_norm": 0.5595487356185913, + "learning_rate": 6.008909587931558e-05, + "loss": 1.6396, + "step": 7894 + }, + { + "epoch": 2.396418272879041, + "grad_norm": 0.7241140604019165, + "learning_rate": 6.008403361344538e-05, + "loss": 1.4284, + "step": 7895 + }, + { + "epoch": 2.396721809075732, + "grad_norm": 0.8329600095748901, + "learning_rate": 6.007897134757517e-05, + "loss": 1.6604, + "step": 7896 + }, + { + "epoch": 2.3970253452724237, + "grad_norm": 0.776324450969696, + "learning_rate": 6.007390908170497e-05, + "loss": 1.2348, + "step": 7897 + }, + { + "epoch": 2.3973288814691154, + "grad_norm": 0.6083389520645142, + "learning_rate": 6.006884681583477e-05, + "loss": 1.5352, + "step": 7898 + }, + { + "epoch": 2.3976324176658066, + "grad_norm": 0.8004028797149658, + "learning_rate": 6.006378454996456e-05, + "loss": 1.0207, + "step": 7899 + }, + { + "epoch": 2.3979359538624982, + "grad_norm": 0.6427205801010132, + "learning_rate": 6.005872228409436e-05, + "loss": 0.9727, + "step": 7900 + }, + { + "epoch": 2.3982394900591895, + "grad_norm": 0.8049157857894897, + "learning_rate": 6.005366001822416e-05, + "loss": 1.4703, + "step": 7901 + }, + { + "epoch": 2.398543026255881, + "grad_norm": 0.6981743574142456, + "learning_rate": 6.0048597752353954e-05, + "loss": 1.6431, + "step": 7902 + }, + { + "epoch": 2.3988465624525723, + "grad_norm": 0.8600196242332458, + "learning_rate": 6.004353548648375e-05, + "loss": 1.4814, + "step": 7903 + }, + { + "epoch": 2.399150098649264, + "grad_norm": 0.8405910730361938, + "learning_rate": 6.0038473220613543e-05, + "loss": 1.395, + "step": 7904 + }, + { + "epoch": 2.399453634845955, + "grad_norm": 0.9069204330444336, + "learning_rate": 6.003341095474334e-05, + "loss": 1.2988, + "step": 7905 + }, + { + "epoch": 2.399757171042647, + "grad_norm": 0.8811964392662048, + "learning_rate": 6.002834868887314e-05, + "loss": 1.389, + "step": 7906 + }, + { + "epoch": 2.400060707239338, + "grad_norm": 0.7575058937072754, + "learning_rate": 6.0023286423002935e-05, + "loss": 1.3244, + "step": 7907 + }, + { + "epoch": 2.4003642434360297, + "grad_norm": 0.7318387031555176, + "learning_rate": 6.001822415713274e-05, + "loss": 1.3653, + "step": 7908 + }, + { + "epoch": 2.4006677796327214, + "grad_norm": 0.8620535731315613, + "learning_rate": 6.001316189126254e-05, + "loss": 1.4485, + "step": 7909 + }, + { + "epoch": 2.4009713158294126, + "grad_norm": 0.6746302843093872, + "learning_rate": 6.000809962539233e-05, + "loss": 0.9841, + "step": 7910 + }, + { + "epoch": 2.4012748520261042, + "grad_norm": 0.6638560891151428, + "learning_rate": 6.000303735952213e-05, + "loss": 0.8332, + "step": 7911 + }, + { + "epoch": 2.4015783882227955, + "grad_norm": 0.5643718242645264, + "learning_rate": 5.999797509365193e-05, + "loss": 2.039, + "step": 7912 + }, + { + "epoch": 2.401881924419487, + "grad_norm": 0.834593653678894, + "learning_rate": 5.9992912827781724e-05, + "loss": 0.9973, + "step": 7913 + }, + { + "epoch": 2.4021854606161783, + "grad_norm": 0.6542152166366577, + "learning_rate": 5.998785056191152e-05, + "loss": 1.6459, + "step": 7914 + }, + { + "epoch": 2.40248899681287, + "grad_norm": 0.8321699500083923, + "learning_rate": 5.998278829604131e-05, + "loss": 1.3346, + "step": 7915 + }, + { + "epoch": 2.4027925330095616, + "grad_norm": 0.6096453070640564, + "learning_rate": 5.997772603017111e-05, + "loss": 1.0405, + "step": 7916 + }, + { + "epoch": 2.403096069206253, + "grad_norm": 0.49466562271118164, + "learning_rate": 5.997266376430091e-05, + "loss": 1.1957, + "step": 7917 + }, + { + "epoch": 2.403399605402944, + "grad_norm": 0.691472589969635, + "learning_rate": 5.9967601498430704e-05, + "loss": 1.6998, + "step": 7918 + }, + { + "epoch": 2.4037031415996357, + "grad_norm": 0.807509183883667, + "learning_rate": 5.99625392325605e-05, + "loss": 1.3887, + "step": 7919 + }, + { + "epoch": 2.4040066777963274, + "grad_norm": 0.7648297548294067, + "learning_rate": 5.9957476966690294e-05, + "loss": 1.0966, + "step": 7920 + }, + { + "epoch": 2.4043102139930186, + "grad_norm": 1.026137113571167, + "learning_rate": 5.995241470082009e-05, + "loss": 0.6286, + "step": 7921 + }, + { + "epoch": 2.4046137501897102, + "grad_norm": 0.8462149500846863, + "learning_rate": 5.994735243494989e-05, + "loss": 1.3598, + "step": 7922 + }, + { + "epoch": 2.4049172863864015, + "grad_norm": 0.8571946620941162, + "learning_rate": 5.9942290169079685e-05, + "loss": 1.4567, + "step": 7923 + }, + { + "epoch": 2.405220822583093, + "grad_norm": 1.0264604091644287, + "learning_rate": 5.993722790320948e-05, + "loss": 0.9721, + "step": 7924 + }, + { + "epoch": 2.4055243587797843, + "grad_norm": 0.8708542585372925, + "learning_rate": 5.9932165637339275e-05, + "loss": 1.3932, + "step": 7925 + }, + { + "epoch": 2.405827894976476, + "grad_norm": 0.8683403134346008, + "learning_rate": 5.9927103371469076e-05, + "loss": 1.5331, + "step": 7926 + }, + { + "epoch": 2.4061314311731676, + "grad_norm": 0.9796139001846313, + "learning_rate": 5.992204110559887e-05, + "loss": 1.3447, + "step": 7927 + }, + { + "epoch": 2.406434967369859, + "grad_norm": 0.6332157254219055, + "learning_rate": 5.9916978839728666e-05, + "loss": 1.4754, + "step": 7928 + }, + { + "epoch": 2.4067385035665505, + "grad_norm": 0.9253177642822266, + "learning_rate": 5.991191657385846e-05, + "loss": 0.8832, + "step": 7929 + }, + { + "epoch": 2.4070420397632417, + "grad_norm": 0.5818787217140198, + "learning_rate": 5.9906854307988256e-05, + "loss": 1.5906, + "step": 7930 + }, + { + "epoch": 2.4073455759599334, + "grad_norm": 0.9348157048225403, + "learning_rate": 5.990179204211806e-05, + "loss": 1.1478, + "step": 7931 + }, + { + "epoch": 2.4076491121566246, + "grad_norm": 0.9060249328613281, + "learning_rate": 5.989672977624785e-05, + "loss": 1.5399, + "step": 7932 + }, + { + "epoch": 2.4079526483533162, + "grad_norm": 0.7704276442527771, + "learning_rate": 5.989166751037765e-05, + "loss": 1.1696, + "step": 7933 + }, + { + "epoch": 2.4082561845500075, + "grad_norm": 0.7888961434364319, + "learning_rate": 5.988660524450744e-05, + "loss": 1.5363, + "step": 7934 + }, + { + "epoch": 2.408559720746699, + "grad_norm": 0.6885265707969666, + "learning_rate": 5.9881542978637236e-05, + "loss": 1.3708, + "step": 7935 + }, + { + "epoch": 2.4088632569433903, + "grad_norm": 0.748163104057312, + "learning_rate": 5.987648071276704e-05, + "loss": 0.9023, + "step": 7936 + }, + { + "epoch": 2.409166793140082, + "grad_norm": 0.8010879158973694, + "learning_rate": 5.987141844689683e-05, + "loss": 1.2594, + "step": 7937 + }, + { + "epoch": 2.4094703293367736, + "grad_norm": 0.7890639305114746, + "learning_rate": 5.986635618102663e-05, + "loss": 1.6747, + "step": 7938 + }, + { + "epoch": 2.409773865533465, + "grad_norm": 0.896912693977356, + "learning_rate": 5.986129391515642e-05, + "loss": 1.2543, + "step": 7939 + }, + { + "epoch": 2.4100774017301565, + "grad_norm": 0.7643724679946899, + "learning_rate": 5.9856231649286224e-05, + "loss": 1.1527, + "step": 7940 + }, + { + "epoch": 2.4103809379268477, + "grad_norm": 0.8025280833244324, + "learning_rate": 5.985116938341602e-05, + "loss": 0.978, + "step": 7941 + }, + { + "epoch": 2.4106844741235394, + "grad_norm": 0.7779070138931274, + "learning_rate": 5.9846107117545813e-05, + "loss": 1.4496, + "step": 7942 + }, + { + "epoch": 2.4109880103202306, + "grad_norm": 0.9251778721809387, + "learning_rate": 5.984104485167561e-05, + "loss": 0.8951, + "step": 7943 + }, + { + "epoch": 2.4112915465169222, + "grad_norm": 0.7787967920303345, + "learning_rate": 5.98359825858054e-05, + "loss": 1.4073, + "step": 7944 + }, + { + "epoch": 2.4115950827136134, + "grad_norm": 0.6125165224075317, + "learning_rate": 5.9830920319935205e-05, + "loss": 1.8558, + "step": 7945 + }, + { + "epoch": 2.411898618910305, + "grad_norm": 0.8009992837905884, + "learning_rate": 5.9825858054065e-05, + "loss": 1.5551, + "step": 7946 + }, + { + "epoch": 2.4122021551069963, + "grad_norm": 0.6516717076301575, + "learning_rate": 5.9820795788194794e-05, + "loss": 0.8592, + "step": 7947 + }, + { + "epoch": 2.412505691303688, + "grad_norm": 0.7446168661117554, + "learning_rate": 5.981573352232459e-05, + "loss": 1.4455, + "step": 7948 + }, + { + "epoch": 2.4128092275003796, + "grad_norm": 0.9416190385818481, + "learning_rate": 5.9810671256454384e-05, + "loss": 1.3713, + "step": 7949 + }, + { + "epoch": 2.413112763697071, + "grad_norm": 0.7172439694404602, + "learning_rate": 5.9805608990584185e-05, + "loss": 1.3651, + "step": 7950 + }, + { + "epoch": 2.4134162998937625, + "grad_norm": 1.0124558210372925, + "learning_rate": 5.980054672471398e-05, + "loss": 1.1258, + "step": 7951 + }, + { + "epoch": 2.4137198360904537, + "grad_norm": 0.6773886680603027, + "learning_rate": 5.979548445884379e-05, + "loss": 1.6095, + "step": 7952 + }, + { + "epoch": 2.4140233722871454, + "grad_norm": 0.6818512678146362, + "learning_rate": 5.979042219297358e-05, + "loss": 1.2836, + "step": 7953 + }, + { + "epoch": 2.4143269084838366, + "grad_norm": 0.8271624445915222, + "learning_rate": 5.978535992710338e-05, + "loss": 1.1351, + "step": 7954 + }, + { + "epoch": 2.4146304446805282, + "grad_norm": 0.6633880138397217, + "learning_rate": 5.978029766123317e-05, + "loss": 1.3878, + "step": 7955 + }, + { + "epoch": 2.4149339808772194, + "grad_norm": 0.7885595560073853, + "learning_rate": 5.9775235395362974e-05, + "loss": 1.4609, + "step": 7956 + }, + { + "epoch": 2.415237517073911, + "grad_norm": 0.9626199007034302, + "learning_rate": 5.977017312949277e-05, + "loss": 1.0395, + "step": 7957 + }, + { + "epoch": 2.4155410532706023, + "grad_norm": 0.8186993598937988, + "learning_rate": 5.9765110863622564e-05, + "loss": 1.5863, + "step": 7958 + }, + { + "epoch": 2.415844589467294, + "grad_norm": 0.6804643273353577, + "learning_rate": 5.976004859775236e-05, + "loss": 1.5754, + "step": 7959 + }, + { + "epoch": 2.4161481256639856, + "grad_norm": 0.7416716814041138, + "learning_rate": 5.9754986331882154e-05, + "loss": 1.4599, + "step": 7960 + }, + { + "epoch": 2.416451661860677, + "grad_norm": 0.621061384677887, + "learning_rate": 5.9749924066011955e-05, + "loss": 1.2628, + "step": 7961 + }, + { + "epoch": 2.4167551980573685, + "grad_norm": 0.8077750205993652, + "learning_rate": 5.974486180014175e-05, + "loss": 0.9207, + "step": 7962 + }, + { + "epoch": 2.4170587342540597, + "grad_norm": 1.0338082313537598, + "learning_rate": 5.9739799534271545e-05, + "loss": 1.4273, + "step": 7963 + }, + { + "epoch": 2.4173622704507514, + "grad_norm": 0.723296046257019, + "learning_rate": 5.973473726840134e-05, + "loss": 0.8518, + "step": 7964 + }, + { + "epoch": 2.4176658066474426, + "grad_norm": 0.7224794626235962, + "learning_rate": 5.972967500253114e-05, + "loss": 1.4885, + "step": 7965 + }, + { + "epoch": 2.4179693428441342, + "grad_norm": 1.3886922597885132, + "learning_rate": 5.9724612736660936e-05, + "loss": 1.3691, + "step": 7966 + }, + { + "epoch": 2.4182728790408254, + "grad_norm": 0.7708404064178467, + "learning_rate": 5.971955047079073e-05, + "loss": 1.5227, + "step": 7967 + }, + { + "epoch": 2.418576415237517, + "grad_norm": 0.7277258038520813, + "learning_rate": 5.9714488204920526e-05, + "loss": 1.4639, + "step": 7968 + }, + { + "epoch": 2.4188799514342083, + "grad_norm": 0.7625682353973389, + "learning_rate": 5.970942593905032e-05, + "loss": 0.9221, + "step": 7969 + }, + { + "epoch": 2.4191834876309, + "grad_norm": 0.5811772346496582, + "learning_rate": 5.970436367318012e-05, + "loss": 0.7974, + "step": 7970 + }, + { + "epoch": 2.4194870238275916, + "grad_norm": 0.7587955594062805, + "learning_rate": 5.969930140730992e-05, + "loss": 1.0833, + "step": 7971 + }, + { + "epoch": 2.419790560024283, + "grad_norm": 0.6895954608917236, + "learning_rate": 5.969423914143971e-05, + "loss": 1.5106, + "step": 7972 + }, + { + "epoch": 2.4200940962209745, + "grad_norm": 1.08235764503479, + "learning_rate": 5.9689176875569506e-05, + "loss": 1.1488, + "step": 7973 + }, + { + "epoch": 2.4203976324176657, + "grad_norm": 0.713238537311554, + "learning_rate": 5.96841146096993e-05, + "loss": 1.4101, + "step": 7974 + }, + { + "epoch": 2.4207011686143574, + "grad_norm": 0.7801069617271423, + "learning_rate": 5.96790523438291e-05, + "loss": 1.2996, + "step": 7975 + }, + { + "epoch": 2.4210047048110486, + "grad_norm": 0.8136372566223145, + "learning_rate": 5.96739900779589e-05, + "loss": 1.1243, + "step": 7976 + }, + { + "epoch": 2.4213082410077402, + "grad_norm": 0.9092775583267212, + "learning_rate": 5.966892781208869e-05, + "loss": 1.157, + "step": 7977 + }, + { + "epoch": 2.4216117772044314, + "grad_norm": 0.7236045002937317, + "learning_rate": 5.966386554621849e-05, + "loss": 1.2423, + "step": 7978 + }, + { + "epoch": 2.421915313401123, + "grad_norm": 0.68597811460495, + "learning_rate": 5.965880328034829e-05, + "loss": 1.5961, + "step": 7979 + }, + { + "epoch": 2.4222188495978143, + "grad_norm": 0.8136271834373474, + "learning_rate": 5.965374101447808e-05, + "loss": 0.96, + "step": 7980 + }, + { + "epoch": 2.422522385794506, + "grad_norm": 0.6901982426643372, + "learning_rate": 5.964867874860788e-05, + "loss": 1.7028, + "step": 7981 + }, + { + "epoch": 2.4228259219911976, + "grad_norm": 0.5978294014930725, + "learning_rate": 5.964361648273767e-05, + "loss": 1.3612, + "step": 7982 + }, + { + "epoch": 2.423129458187889, + "grad_norm": 0.8175364136695862, + "learning_rate": 5.963855421686747e-05, + "loss": 1.1101, + "step": 7983 + }, + { + "epoch": 2.4234329943845805, + "grad_norm": 0.6936042904853821, + "learning_rate": 5.963349195099727e-05, + "loss": 0.793, + "step": 7984 + }, + { + "epoch": 2.4237365305812717, + "grad_norm": 0.8167652487754822, + "learning_rate": 5.9628429685127064e-05, + "loss": 1.1589, + "step": 7985 + }, + { + "epoch": 2.4240400667779634, + "grad_norm": 0.7972689867019653, + "learning_rate": 5.962336741925686e-05, + "loss": 1.1974, + "step": 7986 + }, + { + "epoch": 2.4243436029746546, + "grad_norm": 0.8004547357559204, + "learning_rate": 5.9618305153386654e-05, + "loss": 1.6435, + "step": 7987 + }, + { + "epoch": 2.424647139171346, + "grad_norm": 0.8520717024803162, + "learning_rate": 5.961324288751645e-05, + "loss": 0.866, + "step": 7988 + }, + { + "epoch": 2.424950675368038, + "grad_norm": 0.9278232455253601, + "learning_rate": 5.960818062164625e-05, + "loss": 1.425, + "step": 7989 + }, + { + "epoch": 2.425254211564729, + "grad_norm": 0.5844679474830627, + "learning_rate": 5.9603118355776045e-05, + "loss": 0.9303, + "step": 7990 + }, + { + "epoch": 2.4255577477614203, + "grad_norm": 0.6622089743614197, + "learning_rate": 5.959805608990584e-05, + "loss": 1.0846, + "step": 7991 + }, + { + "epoch": 2.425861283958112, + "grad_norm": 0.7942777872085571, + "learning_rate": 5.9592993824035634e-05, + "loss": 1.6953, + "step": 7992 + }, + { + "epoch": 2.4261648201548036, + "grad_norm": 0.9262164235115051, + "learning_rate": 5.9587931558165436e-05, + "loss": 1.1783, + "step": 7993 + }, + { + "epoch": 2.426468356351495, + "grad_norm": 0.9600434303283691, + "learning_rate": 5.958286929229523e-05, + "loss": 1.4355, + "step": 7994 + }, + { + "epoch": 2.4267718925481865, + "grad_norm": 0.9729022979736328, + "learning_rate": 5.9577807026425026e-05, + "loss": 1.4183, + "step": 7995 + }, + { + "epoch": 2.4270754287448777, + "grad_norm": 0.6061367392539978, + "learning_rate": 5.957274476055482e-05, + "loss": 1.1109, + "step": 7996 + }, + { + "epoch": 2.4273789649415694, + "grad_norm": 0.7526201009750366, + "learning_rate": 5.956768249468463e-05, + "loss": 1.0544, + "step": 7997 + }, + { + "epoch": 2.4276825011382606, + "grad_norm": 0.7746148705482483, + "learning_rate": 5.9562620228814424e-05, + "loss": 1.5715, + "step": 7998 + }, + { + "epoch": 2.427986037334952, + "grad_norm": 0.701327919960022, + "learning_rate": 5.955755796294422e-05, + "loss": 1.1051, + "step": 7999 + }, + { + "epoch": 2.428289573531644, + "grad_norm": 0.6953447461128235, + "learning_rate": 5.955249569707402e-05, + "loss": 1.203, + "step": 8000 + }, + { + "epoch": 2.428593109728335, + "grad_norm": 0.7078439593315125, + "learning_rate": 5.9547433431203815e-05, + "loss": 1.1811, + "step": 8001 + }, + { + "epoch": 2.4288966459250267, + "grad_norm": 0.5841401815414429, + "learning_rate": 5.954237116533361e-05, + "loss": 1.2787, + "step": 8002 + }, + { + "epoch": 2.429200182121718, + "grad_norm": 0.8184963464736938, + "learning_rate": 5.9537308899463404e-05, + "loss": 1.1793, + "step": 8003 + }, + { + "epoch": 2.4295037183184096, + "grad_norm": 0.7969046831130981, + "learning_rate": 5.9532246633593206e-05, + "loss": 1.2765, + "step": 8004 + }, + { + "epoch": 2.429807254515101, + "grad_norm": 0.6305599808692932, + "learning_rate": 5.9527184367723e-05, + "loss": 1.6182, + "step": 8005 + }, + { + "epoch": 2.4301107907117925, + "grad_norm": 0.7845397591590881, + "learning_rate": 5.9522122101852795e-05, + "loss": 1.5286, + "step": 8006 + }, + { + "epoch": 2.4304143269084837, + "grad_norm": 0.8575507998466492, + "learning_rate": 5.951705983598259e-05, + "loss": 1.2778, + "step": 8007 + }, + { + "epoch": 2.4307178631051753, + "grad_norm": 0.6721327304840088, + "learning_rate": 5.9511997570112385e-05, + "loss": 1.4541, + "step": 8008 + }, + { + "epoch": 2.4310213993018666, + "grad_norm": 0.7807362079620361, + "learning_rate": 5.950693530424219e-05, + "loss": 1.6529, + "step": 8009 + }, + { + "epoch": 2.431324935498558, + "grad_norm": 0.7468159794807434, + "learning_rate": 5.950187303837198e-05, + "loss": 1.5048, + "step": 8010 + }, + { + "epoch": 2.43162847169525, + "grad_norm": 0.9903224110603333, + "learning_rate": 5.9496810772501776e-05, + "loss": 1.2826, + "step": 8011 + }, + { + "epoch": 2.431932007891941, + "grad_norm": 0.8126440048217773, + "learning_rate": 5.949174850663157e-05, + "loss": 1.0912, + "step": 8012 + }, + { + "epoch": 2.4322355440886327, + "grad_norm": 0.6487678289413452, + "learning_rate": 5.9486686240761366e-05, + "loss": 1.0674, + "step": 8013 + }, + { + "epoch": 2.432539080285324, + "grad_norm": 0.8460603952407837, + "learning_rate": 5.948162397489117e-05, + "loss": 1.446, + "step": 8014 + }, + { + "epoch": 2.4328426164820156, + "grad_norm": 0.9071548581123352, + "learning_rate": 5.947656170902096e-05, + "loss": 1.3949, + "step": 8015 + }, + { + "epoch": 2.433146152678707, + "grad_norm": 0.8428886532783508, + "learning_rate": 5.947149944315076e-05, + "loss": 1.3674, + "step": 8016 + }, + { + "epoch": 2.4334496888753985, + "grad_norm": 0.78566974401474, + "learning_rate": 5.946643717728055e-05, + "loss": 1.6298, + "step": 8017 + }, + { + "epoch": 2.4337532250720897, + "grad_norm": 0.796306312084198, + "learning_rate": 5.946137491141035e-05, + "loss": 1.4393, + "step": 8018 + }, + { + "epoch": 2.4340567612687813, + "grad_norm": 0.8854008316993713, + "learning_rate": 5.945631264554015e-05, + "loss": 0.7501, + "step": 8019 + }, + { + "epoch": 2.4343602974654726, + "grad_norm": 0.8512039184570312, + "learning_rate": 5.945125037966994e-05, + "loss": 1.4922, + "step": 8020 + }, + { + "epoch": 2.434663833662164, + "grad_norm": 0.719879150390625, + "learning_rate": 5.944618811379974e-05, + "loss": 1.2084, + "step": 8021 + }, + { + "epoch": 2.434967369858856, + "grad_norm": 0.8208937644958496, + "learning_rate": 5.944112584792953e-05, + "loss": 1.2646, + "step": 8022 + }, + { + "epoch": 2.435270906055547, + "grad_norm": 0.8240382075309753, + "learning_rate": 5.9436063582059334e-05, + "loss": 1.5951, + "step": 8023 + }, + { + "epoch": 2.4355744422522387, + "grad_norm": 0.7194306254386902, + "learning_rate": 5.943100131618913e-05, + "loss": 1.4623, + "step": 8024 + }, + { + "epoch": 2.43587797844893, + "grad_norm": 0.8812718987464905, + "learning_rate": 5.9425939050318924e-05, + "loss": 0.5675, + "step": 8025 + }, + { + "epoch": 2.4361815146456216, + "grad_norm": 0.9816334247589111, + "learning_rate": 5.942087678444872e-05, + "loss": 1.2629, + "step": 8026 + }, + { + "epoch": 2.436485050842313, + "grad_norm": 0.851266086101532, + "learning_rate": 5.941581451857851e-05, + "loss": 1.3655, + "step": 8027 + }, + { + "epoch": 2.4367885870390045, + "grad_norm": 0.7911189198493958, + "learning_rate": 5.9410752252708315e-05, + "loss": 1.3256, + "step": 8028 + }, + { + "epoch": 2.4370921232356957, + "grad_norm": 0.7403606176376343, + "learning_rate": 5.940568998683811e-05, + "loss": 1.1923, + "step": 8029 + }, + { + "epoch": 2.4373956594323873, + "grad_norm": 0.8709863424301147, + "learning_rate": 5.9400627720967904e-05, + "loss": 1.2818, + "step": 8030 + }, + { + "epoch": 2.4376991956290786, + "grad_norm": 0.7635475397109985, + "learning_rate": 5.93955654550977e-05, + "loss": 1.5084, + "step": 8031 + }, + { + "epoch": 2.43800273182577, + "grad_norm": 0.916801393032074, + "learning_rate": 5.93905031892275e-05, + "loss": 1.2785, + "step": 8032 + }, + { + "epoch": 2.438306268022462, + "grad_norm": 1.1839587688446045, + "learning_rate": 5.9385440923357296e-05, + "loss": 1.1729, + "step": 8033 + }, + { + "epoch": 2.438609804219153, + "grad_norm": 1.0023239850997925, + "learning_rate": 5.938037865748709e-05, + "loss": 1.023, + "step": 8034 + }, + { + "epoch": 2.4389133404158447, + "grad_norm": 0.6197182536125183, + "learning_rate": 5.9375316391616885e-05, + "loss": 1.5478, + "step": 8035 + }, + { + "epoch": 2.439216876612536, + "grad_norm": 0.9122412800788879, + "learning_rate": 5.937025412574668e-05, + "loss": 1.1756, + "step": 8036 + }, + { + "epoch": 2.4395204128092276, + "grad_norm": 0.8996947407722473, + "learning_rate": 5.936519185987648e-05, + "loss": 1.6531, + "step": 8037 + }, + { + "epoch": 2.439823949005919, + "grad_norm": 1.0564600229263306, + "learning_rate": 5.9360129594006276e-05, + "loss": 1.3561, + "step": 8038 + }, + { + "epoch": 2.4401274852026105, + "grad_norm": 0.8630561828613281, + "learning_rate": 5.935506732813607e-05, + "loss": 0.662, + "step": 8039 + }, + { + "epoch": 2.4404310213993017, + "grad_norm": 0.6715168356895447, + "learning_rate": 5.9350005062265866e-05, + "loss": 1.5388, + "step": 8040 + }, + { + "epoch": 2.4407345575959933, + "grad_norm": 0.7765439748764038, + "learning_rate": 5.9344942796395674e-05, + "loss": 0.7733, + "step": 8041 + }, + { + "epoch": 2.4410380937926845, + "grad_norm": 0.7207764387130737, + "learning_rate": 5.933988053052547e-05, + "loss": 1.1412, + "step": 8042 + }, + { + "epoch": 2.441341629989376, + "grad_norm": 0.6615632772445679, + "learning_rate": 5.933481826465527e-05, + "loss": 0.8974, + "step": 8043 + }, + { + "epoch": 2.441645166186068, + "grad_norm": 0.5565609335899353, + "learning_rate": 5.9329755998785065e-05, + "loss": 0.6975, + "step": 8044 + }, + { + "epoch": 2.441948702382759, + "grad_norm": 0.7616466879844666, + "learning_rate": 5.932469373291486e-05, + "loss": 1.4777, + "step": 8045 + }, + { + "epoch": 2.4422522385794507, + "grad_norm": 0.6495609283447266, + "learning_rate": 5.9319631467044655e-05, + "loss": 1.2428, + "step": 8046 + }, + { + "epoch": 2.442555774776142, + "grad_norm": 1.0401421785354614, + "learning_rate": 5.931456920117445e-05, + "loss": 1.0536, + "step": 8047 + }, + { + "epoch": 2.4428593109728336, + "grad_norm": 0.8170029520988464, + "learning_rate": 5.930950693530425e-05, + "loss": 1.3697, + "step": 8048 + }, + { + "epoch": 2.443162847169525, + "grad_norm": 0.6555508971214294, + "learning_rate": 5.9304444669434046e-05, + "loss": 1.2629, + "step": 8049 + }, + { + "epoch": 2.4434663833662165, + "grad_norm": 0.5987980961799622, + "learning_rate": 5.929938240356384e-05, + "loss": 1.61, + "step": 8050 + }, + { + "epoch": 2.443769919562908, + "grad_norm": 0.7624805569648743, + "learning_rate": 5.9294320137693636e-05, + "loss": 1.327, + "step": 8051 + }, + { + "epoch": 2.4440734557595993, + "grad_norm": 0.9335926175117493, + "learning_rate": 5.928925787182343e-05, + "loss": 0.8136, + "step": 8052 + }, + { + "epoch": 2.4443769919562905, + "grad_norm": 0.6693974733352661, + "learning_rate": 5.928419560595323e-05, + "loss": 1.6345, + "step": 8053 + }, + { + "epoch": 2.444680528152982, + "grad_norm": 0.7421702146530151, + "learning_rate": 5.927913334008303e-05, + "loss": 1.0317, + "step": 8054 + }, + { + "epoch": 2.444984064349674, + "grad_norm": 0.794100284576416, + "learning_rate": 5.927407107421282e-05, + "loss": 1.4123, + "step": 8055 + }, + { + "epoch": 2.445287600546365, + "grad_norm": 0.8165601491928101, + "learning_rate": 5.9269008808342617e-05, + "loss": 1.3916, + "step": 8056 + }, + { + "epoch": 2.4455911367430567, + "grad_norm": 0.635057270526886, + "learning_rate": 5.926394654247242e-05, + "loss": 1.3165, + "step": 8057 + }, + { + "epoch": 2.445894672939748, + "grad_norm": 0.7940477728843689, + "learning_rate": 5.925888427660221e-05, + "loss": 1.0743, + "step": 8058 + }, + { + "epoch": 2.4461982091364396, + "grad_norm": 0.8174204230308533, + "learning_rate": 5.925382201073201e-05, + "loss": 1.426, + "step": 8059 + }, + { + "epoch": 2.446501745333131, + "grad_norm": 0.7778024673461914, + "learning_rate": 5.92487597448618e-05, + "loss": 1.4384, + "step": 8060 + }, + { + "epoch": 2.4468052815298225, + "grad_norm": 0.5465704798698425, + "learning_rate": 5.92436974789916e-05, + "loss": 1.2255, + "step": 8061 + }, + { + "epoch": 2.447108817726514, + "grad_norm": 0.6376035809516907, + "learning_rate": 5.92386352131214e-05, + "loss": 1.2138, + "step": 8062 + }, + { + "epoch": 2.4474123539232053, + "grad_norm": 0.9984663128852844, + "learning_rate": 5.9233572947251194e-05, + "loss": 1.454, + "step": 8063 + }, + { + "epoch": 2.447715890119897, + "grad_norm": 0.7464830875396729, + "learning_rate": 5.922851068138099e-05, + "loss": 1.6236, + "step": 8064 + }, + { + "epoch": 2.448019426316588, + "grad_norm": 0.8578869700431824, + "learning_rate": 5.922344841551078e-05, + "loss": 0.7635, + "step": 8065 + }, + { + "epoch": 2.44832296251328, + "grad_norm": 0.8007357716560364, + "learning_rate": 5.921838614964058e-05, + "loss": 1.4443, + "step": 8066 + }, + { + "epoch": 2.448626498709971, + "grad_norm": 0.6420935988426208, + "learning_rate": 5.921332388377038e-05, + "loss": 1.4141, + "step": 8067 + }, + { + "epoch": 2.4489300349066627, + "grad_norm": 0.8005525469779968, + "learning_rate": 5.9208261617900174e-05, + "loss": 1.3446, + "step": 8068 + }, + { + "epoch": 2.449233571103354, + "grad_norm": 0.7144571542739868, + "learning_rate": 5.920319935202997e-05, + "loss": 1.1064, + "step": 8069 + }, + { + "epoch": 2.4495371073000456, + "grad_norm": 0.6875942945480347, + "learning_rate": 5.9198137086159764e-05, + "loss": 1.1621, + "step": 8070 + }, + { + "epoch": 2.449840643496737, + "grad_norm": 0.6312223076820374, + "learning_rate": 5.9193074820289566e-05, + "loss": 1.1365, + "step": 8071 + }, + { + "epoch": 2.4501441796934285, + "grad_norm": 0.8454585671424866, + "learning_rate": 5.918801255441936e-05, + "loss": 1.3123, + "step": 8072 + }, + { + "epoch": 2.45044771589012, + "grad_norm": 0.626213014125824, + "learning_rate": 5.9182950288549155e-05, + "loss": 1.3838, + "step": 8073 + }, + { + "epoch": 2.4507512520868113, + "grad_norm": 0.8456754088401794, + "learning_rate": 5.917788802267895e-05, + "loss": 0.8246, + "step": 8074 + }, + { + "epoch": 2.451054788283503, + "grad_norm": 0.9328720569610596, + "learning_rate": 5.9172825756808745e-05, + "loss": 0.8562, + "step": 8075 + }, + { + "epoch": 2.451358324480194, + "grad_norm": 0.8819989562034607, + "learning_rate": 5.9167763490938546e-05, + "loss": 1.0555, + "step": 8076 + }, + { + "epoch": 2.451661860676886, + "grad_norm": 0.5868912935256958, + "learning_rate": 5.916270122506834e-05, + "loss": 1.691, + "step": 8077 + }, + { + "epoch": 2.451965396873577, + "grad_norm": 0.6053828597068787, + "learning_rate": 5.9157638959198136e-05, + "loss": 0.7372, + "step": 8078 + }, + { + "epoch": 2.4522689330702687, + "grad_norm": 0.7905032634735107, + "learning_rate": 5.915257669332793e-05, + "loss": 0.9422, + "step": 8079 + }, + { + "epoch": 2.45257246926696, + "grad_norm": 0.6530877947807312, + "learning_rate": 5.9147514427457725e-05, + "loss": 1.5496, + "step": 8080 + }, + { + "epoch": 2.4528760054636516, + "grad_norm": 0.6365821361541748, + "learning_rate": 5.914245216158753e-05, + "loss": 1.0235, + "step": 8081 + }, + { + "epoch": 2.453179541660343, + "grad_norm": 0.8876006603240967, + "learning_rate": 5.913738989571732e-05, + "loss": 1.5601, + "step": 8082 + }, + { + "epoch": 2.4534830778570345, + "grad_norm": 0.7808222770690918, + "learning_rate": 5.913232762984712e-05, + "loss": 1.8001, + "step": 8083 + }, + { + "epoch": 2.453786614053726, + "grad_norm": 0.6512035131454468, + "learning_rate": 5.912726536397691e-05, + "loss": 1.305, + "step": 8084 + }, + { + "epoch": 2.4540901502504173, + "grad_norm": 0.6710624098777771, + "learning_rate": 5.912220309810671e-05, + "loss": 1.236, + "step": 8085 + }, + { + "epoch": 2.454393686447109, + "grad_norm": 0.771418035030365, + "learning_rate": 5.9117140832236515e-05, + "loss": 1.1527, + "step": 8086 + }, + { + "epoch": 2.4546972226438, + "grad_norm": 0.8948315978050232, + "learning_rate": 5.9112078566366316e-05, + "loss": 1.3296, + "step": 8087 + }, + { + "epoch": 2.455000758840492, + "grad_norm": 0.7415945529937744, + "learning_rate": 5.910701630049611e-05, + "loss": 0.934, + "step": 8088 + }, + { + "epoch": 2.455304295037183, + "grad_norm": 0.9084923267364502, + "learning_rate": 5.9101954034625906e-05, + "loss": 1.3117, + "step": 8089 + }, + { + "epoch": 2.4556078312338747, + "grad_norm": 0.8375921249389648, + "learning_rate": 5.90968917687557e-05, + "loss": 1.3953, + "step": 8090 + }, + { + "epoch": 2.455911367430566, + "grad_norm": 0.6972025632858276, + "learning_rate": 5.9091829502885495e-05, + "loss": 1.6261, + "step": 8091 + }, + { + "epoch": 2.4562149036272576, + "grad_norm": 1.3651702404022217, + "learning_rate": 5.90867672370153e-05, + "loss": 1.2544, + "step": 8092 + }, + { + "epoch": 2.456518439823949, + "grad_norm": 0.7613970637321472, + "learning_rate": 5.908170497114509e-05, + "loss": 1.312, + "step": 8093 + }, + { + "epoch": 2.4568219760206405, + "grad_norm": 0.9772050380706787, + "learning_rate": 5.9076642705274886e-05, + "loss": 1.0077, + "step": 8094 + }, + { + "epoch": 2.457125512217332, + "grad_norm": 0.775710940361023, + "learning_rate": 5.907158043940468e-05, + "loss": 1.181, + "step": 8095 + }, + { + "epoch": 2.4574290484140233, + "grad_norm": 0.7317579388618469, + "learning_rate": 5.906651817353448e-05, + "loss": 1.45, + "step": 8096 + }, + { + "epoch": 2.457732584610715, + "grad_norm": 0.7279502749443054, + "learning_rate": 5.906145590766428e-05, + "loss": 1.1328, + "step": 8097 + }, + { + "epoch": 2.458036120807406, + "grad_norm": 0.8418558835983276, + "learning_rate": 5.905639364179407e-05, + "loss": 1.1068, + "step": 8098 + }, + { + "epoch": 2.458339657004098, + "grad_norm": 0.7958865165710449, + "learning_rate": 5.905133137592387e-05, + "loss": 1.3822, + "step": 8099 + }, + { + "epoch": 2.458643193200789, + "grad_norm": 0.8171069622039795, + "learning_rate": 5.904626911005366e-05, + "loss": 1.4247, + "step": 8100 + }, + { + "epoch": 2.4589467293974807, + "grad_norm": 0.5791099667549133, + "learning_rate": 5.9041206844183464e-05, + "loss": 1.3285, + "step": 8101 + }, + { + "epoch": 2.459250265594172, + "grad_norm": 0.8275609612464905, + "learning_rate": 5.903614457831326e-05, + "loss": 1.1727, + "step": 8102 + }, + { + "epoch": 2.4595538017908636, + "grad_norm": 0.8169893622398376, + "learning_rate": 5.903108231244305e-05, + "loss": 1.2051, + "step": 8103 + }, + { + "epoch": 2.459857337987555, + "grad_norm": 0.5260883569717407, + "learning_rate": 5.902602004657285e-05, + "loss": 0.8089, + "step": 8104 + }, + { + "epoch": 2.4601608741842464, + "grad_norm": 0.7259795069694519, + "learning_rate": 5.902095778070264e-05, + "loss": 0.9742, + "step": 8105 + }, + { + "epoch": 2.460464410380938, + "grad_norm": 0.6402779817581177, + "learning_rate": 5.9015895514832444e-05, + "loss": 1.0891, + "step": 8106 + }, + { + "epoch": 2.4607679465776293, + "grad_norm": 0.7106859087944031, + "learning_rate": 5.901083324896224e-05, + "loss": 1.6534, + "step": 8107 + }, + { + "epoch": 2.461071482774321, + "grad_norm": 0.7413371205329895, + "learning_rate": 5.9005770983092034e-05, + "loss": 1.4216, + "step": 8108 + }, + { + "epoch": 2.461375018971012, + "grad_norm": 0.817121148109436, + "learning_rate": 5.900070871722183e-05, + "loss": 0.8438, + "step": 8109 + }, + { + "epoch": 2.461678555167704, + "grad_norm": 0.767145574092865, + "learning_rate": 5.899564645135163e-05, + "loss": 0.9623, + "step": 8110 + }, + { + "epoch": 2.461982091364395, + "grad_norm": 0.9798495769500732, + "learning_rate": 5.8990584185481425e-05, + "loss": 1.021, + "step": 8111 + }, + { + "epoch": 2.4622856275610867, + "grad_norm": 0.7143928408622742, + "learning_rate": 5.898552191961122e-05, + "loss": 1.6554, + "step": 8112 + }, + { + "epoch": 2.462589163757778, + "grad_norm": 0.8441614508628845, + "learning_rate": 5.8980459653741015e-05, + "loss": 0.931, + "step": 8113 + }, + { + "epoch": 2.4628926999544696, + "grad_norm": 0.5994144082069397, + "learning_rate": 5.897539738787081e-05, + "loss": 0.9296, + "step": 8114 + }, + { + "epoch": 2.463196236151161, + "grad_norm": 0.7627611756324768, + "learning_rate": 5.897033512200061e-05, + "loss": 1.5269, + "step": 8115 + }, + { + "epoch": 2.4634997723478524, + "grad_norm": 0.6816883087158203, + "learning_rate": 5.8965272856130406e-05, + "loss": 1.4144, + "step": 8116 + }, + { + "epoch": 2.463803308544544, + "grad_norm": 0.7906539440155029, + "learning_rate": 5.89602105902602e-05, + "loss": 1.273, + "step": 8117 + }, + { + "epoch": 2.4641068447412353, + "grad_norm": 0.5948166847229004, + "learning_rate": 5.8955148324389995e-05, + "loss": 1.5956, + "step": 8118 + }, + { + "epoch": 2.464410380937927, + "grad_norm": 0.6496350765228271, + "learning_rate": 5.895008605851979e-05, + "loss": 0.897, + "step": 8119 + }, + { + "epoch": 2.464713917134618, + "grad_norm": 0.7230144143104553, + "learning_rate": 5.894502379264959e-05, + "loss": 1.5053, + "step": 8120 + }, + { + "epoch": 2.46501745333131, + "grad_norm": 0.8181447982788086, + "learning_rate": 5.8939961526779387e-05, + "loss": 1.1219, + "step": 8121 + }, + { + "epoch": 2.465320989528001, + "grad_norm": 0.6964221596717834, + "learning_rate": 5.893489926090918e-05, + "loss": 1.8231, + "step": 8122 + }, + { + "epoch": 2.4656245257246927, + "grad_norm": 0.6474187970161438, + "learning_rate": 5.8929836995038976e-05, + "loss": 1.7453, + "step": 8123 + }, + { + "epoch": 2.4659280619213844, + "grad_norm": 0.8115422129631042, + "learning_rate": 5.892477472916878e-05, + "loss": 1.0226, + "step": 8124 + }, + { + "epoch": 2.4662315981180756, + "grad_norm": 0.9050765037536621, + "learning_rate": 5.891971246329857e-05, + "loss": 1.3473, + "step": 8125 + }, + { + "epoch": 2.466535134314767, + "grad_norm": 0.6008917093276978, + "learning_rate": 5.891465019742837e-05, + "loss": 1.6152, + "step": 8126 + }, + { + "epoch": 2.4668386705114584, + "grad_norm": 0.7273897528648376, + "learning_rate": 5.890958793155816e-05, + "loss": 1.5513, + "step": 8127 + }, + { + "epoch": 2.46714220670815, + "grad_norm": 0.7605626583099365, + "learning_rate": 5.890452566568796e-05, + "loss": 1.0746, + "step": 8128 + }, + { + "epoch": 2.4674457429048413, + "grad_norm": 0.7195847034454346, + "learning_rate": 5.889946339981776e-05, + "loss": 1.4435, + "step": 8129 + }, + { + "epoch": 2.467749279101533, + "grad_norm": 0.9151094555854797, + "learning_rate": 5.889440113394756e-05, + "loss": 1.4713, + "step": 8130 + }, + { + "epoch": 2.468052815298224, + "grad_norm": 0.5849425196647644, + "learning_rate": 5.888933886807736e-05, + "loss": 1.0833, + "step": 8131 + }, + { + "epoch": 2.468356351494916, + "grad_norm": 0.8793689608573914, + "learning_rate": 5.8884276602207156e-05, + "loss": 1.2899, + "step": 8132 + }, + { + "epoch": 2.468659887691607, + "grad_norm": 0.7815329432487488, + "learning_rate": 5.887921433633695e-05, + "loss": 1.4963, + "step": 8133 + }, + { + "epoch": 2.4689634238882987, + "grad_norm": 0.8761633038520813, + "learning_rate": 5.8874152070466746e-05, + "loss": 1.4107, + "step": 8134 + }, + { + "epoch": 2.4692669600849904, + "grad_norm": 0.7818664312362671, + "learning_rate": 5.886908980459655e-05, + "loss": 1.7088, + "step": 8135 + }, + { + "epoch": 2.4695704962816816, + "grad_norm": 0.8316283226013184, + "learning_rate": 5.886402753872634e-05, + "loss": 1.1892, + "step": 8136 + }, + { + "epoch": 2.4698740324783732, + "grad_norm": 0.8171117901802063, + "learning_rate": 5.885896527285614e-05, + "loss": 1.231, + "step": 8137 + }, + { + "epoch": 2.4701775686750644, + "grad_norm": 0.720648467540741, + "learning_rate": 5.885390300698593e-05, + "loss": 1.5502, + "step": 8138 + }, + { + "epoch": 2.470481104871756, + "grad_norm": 0.7011475563049316, + "learning_rate": 5.884884074111573e-05, + "loss": 1.2047, + "step": 8139 + }, + { + "epoch": 2.4707846410684473, + "grad_norm": 0.7628121376037598, + "learning_rate": 5.884377847524553e-05, + "loss": 1.7777, + "step": 8140 + }, + { + "epoch": 2.471088177265139, + "grad_norm": 0.7602118253707886, + "learning_rate": 5.883871620937532e-05, + "loss": 1.0339, + "step": 8141 + }, + { + "epoch": 2.47139171346183, + "grad_norm": 0.724816083908081, + "learning_rate": 5.883365394350512e-05, + "loss": 1.4265, + "step": 8142 + }, + { + "epoch": 2.471695249658522, + "grad_norm": 0.8720203042030334, + "learning_rate": 5.882859167763491e-05, + "loss": 1.3592, + "step": 8143 + }, + { + "epoch": 2.471998785855213, + "grad_norm": 0.8329253196716309, + "learning_rate": 5.882352941176471e-05, + "loss": 1.3344, + "step": 8144 + }, + { + "epoch": 2.4723023220519047, + "grad_norm": 0.6739696264266968, + "learning_rate": 5.881846714589451e-05, + "loss": 1.3349, + "step": 8145 + }, + { + "epoch": 2.4726058582485964, + "grad_norm": 0.7025054693222046, + "learning_rate": 5.8813404880024304e-05, + "loss": 1.3175, + "step": 8146 + }, + { + "epoch": 2.4729093944452876, + "grad_norm": 0.5982341170310974, + "learning_rate": 5.88083426141541e-05, + "loss": 1.4763, + "step": 8147 + }, + { + "epoch": 2.4732129306419792, + "grad_norm": 0.8387122750282288, + "learning_rate": 5.8803280348283893e-05, + "loss": 1.5114, + "step": 8148 + }, + { + "epoch": 2.4735164668386704, + "grad_norm": 0.7305405139923096, + "learning_rate": 5.8798218082413695e-05, + "loss": 0.7805, + "step": 8149 + }, + { + "epoch": 2.473820003035362, + "grad_norm": 0.7596865296363831, + "learning_rate": 5.879315581654349e-05, + "loss": 1.4535, + "step": 8150 + }, + { + "epoch": 2.4741235392320533, + "grad_norm": 0.7964050769805908, + "learning_rate": 5.8788093550673285e-05, + "loss": 1.5063, + "step": 8151 + }, + { + "epoch": 2.474427075428745, + "grad_norm": 0.8814842104911804, + "learning_rate": 5.878303128480308e-05, + "loss": 1.2121, + "step": 8152 + }, + { + "epoch": 2.474730611625436, + "grad_norm": 0.5663175582885742, + "learning_rate": 5.8777969018932874e-05, + "loss": 1.6235, + "step": 8153 + }, + { + "epoch": 2.475034147822128, + "grad_norm": 0.6952011585235596, + "learning_rate": 5.8772906753062676e-05, + "loss": 1.651, + "step": 8154 + }, + { + "epoch": 2.475337684018819, + "grad_norm": 0.9040622711181641, + "learning_rate": 5.876784448719247e-05, + "loss": 1.4159, + "step": 8155 + }, + { + "epoch": 2.4756412202155107, + "grad_norm": 0.656825840473175, + "learning_rate": 5.8762782221322265e-05, + "loss": 1.2716, + "step": 8156 + }, + { + "epoch": 2.4759447564122024, + "grad_norm": 0.7671054005622864, + "learning_rate": 5.875771995545206e-05, + "loss": 1.3454, + "step": 8157 + }, + { + "epoch": 2.4762482926088936, + "grad_norm": 0.677207887172699, + "learning_rate": 5.8752657689581855e-05, + "loss": 1.1305, + "step": 8158 + }, + { + "epoch": 2.476551828805585, + "grad_norm": 0.6357643008232117, + "learning_rate": 5.8747595423711657e-05, + "loss": 0.4722, + "step": 8159 + }, + { + "epoch": 2.4768553650022764, + "grad_norm": 0.9290893077850342, + "learning_rate": 5.874253315784145e-05, + "loss": 1.1663, + "step": 8160 + }, + { + "epoch": 2.477158901198968, + "grad_norm": 0.9151197671890259, + "learning_rate": 5.8737470891971246e-05, + "loss": 1.818, + "step": 8161 + }, + { + "epoch": 2.4774624373956593, + "grad_norm": 0.7219125628471375, + "learning_rate": 5.873240862610104e-05, + "loss": 0.9134, + "step": 8162 + }, + { + "epoch": 2.477765973592351, + "grad_norm": 0.6985493898391724, + "learning_rate": 5.872734636023084e-05, + "loss": 1.5476, + "step": 8163 + }, + { + "epoch": 2.478069509789042, + "grad_norm": 0.8199888467788696, + "learning_rate": 5.872228409436064e-05, + "loss": 1.2895, + "step": 8164 + }, + { + "epoch": 2.478373045985734, + "grad_norm": 0.7273992300033569, + "learning_rate": 5.871722182849043e-05, + "loss": 1.4617, + "step": 8165 + }, + { + "epoch": 2.478676582182425, + "grad_norm": 0.7608432173728943, + "learning_rate": 5.871215956262023e-05, + "loss": 0.5711, + "step": 8166 + }, + { + "epoch": 2.4789801183791167, + "grad_norm": 0.6103653907775879, + "learning_rate": 5.870709729675002e-05, + "loss": 0.7705, + "step": 8167 + }, + { + "epoch": 2.4792836545758083, + "grad_norm": 1.349910020828247, + "learning_rate": 5.870203503087982e-05, + "loss": 1.0938, + "step": 8168 + }, + { + "epoch": 2.4795871907724996, + "grad_norm": 0.7767732739448547, + "learning_rate": 5.869697276500962e-05, + "loss": 1.3891, + "step": 8169 + }, + { + "epoch": 2.479890726969191, + "grad_norm": 0.6857088208198547, + "learning_rate": 5.869191049913941e-05, + "loss": 1.3864, + "step": 8170 + }, + { + "epoch": 2.4801942631658824, + "grad_norm": 0.694058358669281, + "learning_rate": 5.868684823326921e-05, + "loss": 1.1043, + "step": 8171 + }, + { + "epoch": 2.480497799362574, + "grad_norm": 0.7979057431221008, + "learning_rate": 5.8681785967399e-05, + "loss": 1.3119, + "step": 8172 + }, + { + "epoch": 2.4808013355592653, + "grad_norm": 0.7226840257644653, + "learning_rate": 5.8676723701528804e-05, + "loss": 1.3461, + "step": 8173 + }, + { + "epoch": 2.481104871755957, + "grad_norm": 1.1542717218399048, + "learning_rate": 5.86716614356586e-05, + "loss": 0.9093, + "step": 8174 + }, + { + "epoch": 2.481408407952648, + "grad_norm": 0.8051673173904419, + "learning_rate": 5.866659916978841e-05, + "loss": 1.141, + "step": 8175 + }, + { + "epoch": 2.48171194414934, + "grad_norm": 0.7753127217292786, + "learning_rate": 5.86615369039182e-05, + "loss": 0.6627, + "step": 8176 + }, + { + "epoch": 2.482015480346031, + "grad_norm": 0.9074029922485352, + "learning_rate": 5.8656474638048e-05, + "loss": 1.2709, + "step": 8177 + }, + { + "epoch": 2.4823190165427227, + "grad_norm": 0.9374541640281677, + "learning_rate": 5.865141237217779e-05, + "loss": 1.0258, + "step": 8178 + }, + { + "epoch": 2.4826225527394143, + "grad_norm": 0.7733614444732666, + "learning_rate": 5.864635010630759e-05, + "loss": 1.4448, + "step": 8179 + }, + { + "epoch": 2.4829260889361056, + "grad_norm": 0.8372223973274231, + "learning_rate": 5.864128784043739e-05, + "loss": 1.4152, + "step": 8180 + }, + { + "epoch": 2.483229625132797, + "grad_norm": 1.0079917907714844, + "learning_rate": 5.863622557456718e-05, + "loss": 1.1764, + "step": 8181 + }, + { + "epoch": 2.4835331613294884, + "grad_norm": 0.8187892436981201, + "learning_rate": 5.863116330869698e-05, + "loss": 0.805, + "step": 8182 + }, + { + "epoch": 2.48383669752618, + "grad_norm": 0.618904173374176, + "learning_rate": 5.862610104282677e-05, + "loss": 1.4071, + "step": 8183 + }, + { + "epoch": 2.4841402337228713, + "grad_norm": 0.8110464811325073, + "learning_rate": 5.8621038776956574e-05, + "loss": 1.5362, + "step": 8184 + }, + { + "epoch": 2.484443769919563, + "grad_norm": 0.8422300815582275, + "learning_rate": 5.861597651108637e-05, + "loss": 0.8139, + "step": 8185 + }, + { + "epoch": 2.4847473061162546, + "grad_norm": 0.9874032139778137, + "learning_rate": 5.8610914245216163e-05, + "loss": 1.3182, + "step": 8186 + }, + { + "epoch": 2.485050842312946, + "grad_norm": 0.8307647109031677, + "learning_rate": 5.860585197934596e-05, + "loss": 0.6868, + "step": 8187 + }, + { + "epoch": 2.485354378509637, + "grad_norm": 0.6423392295837402, + "learning_rate": 5.860078971347576e-05, + "loss": 1.3965, + "step": 8188 + }, + { + "epoch": 2.4856579147063287, + "grad_norm": 0.7162687182426453, + "learning_rate": 5.8595727447605555e-05, + "loss": 0.9992, + "step": 8189 + }, + { + "epoch": 2.4859614509030203, + "grad_norm": 0.6741542816162109, + "learning_rate": 5.859066518173535e-05, + "loss": 0.7475, + "step": 8190 + }, + { + "epoch": 2.4862649870997116, + "grad_norm": 0.779192328453064, + "learning_rate": 5.8585602915865144e-05, + "loss": 1.0529, + "step": 8191 + }, + { + "epoch": 2.486568523296403, + "grad_norm": 0.6369726061820984, + "learning_rate": 5.858054064999494e-05, + "loss": 1.0056, + "step": 8192 + }, + { + "epoch": 2.4868720594930944, + "grad_norm": 2.1895089149475098, + "learning_rate": 5.857547838412474e-05, + "loss": 1.3649, + "step": 8193 + }, + { + "epoch": 2.487175595689786, + "grad_norm": 0.9455059170722961, + "learning_rate": 5.8570416118254535e-05, + "loss": 1.2689, + "step": 8194 + }, + { + "epoch": 2.4874791318864773, + "grad_norm": 0.7931092977523804, + "learning_rate": 5.856535385238433e-05, + "loss": 1.2326, + "step": 8195 + }, + { + "epoch": 2.487782668083169, + "grad_norm": 0.7546300888061523, + "learning_rate": 5.8560291586514125e-05, + "loss": 1.1597, + "step": 8196 + }, + { + "epoch": 2.4880862042798606, + "grad_norm": 0.7067909836769104, + "learning_rate": 5.855522932064392e-05, + "loss": 1.7046, + "step": 8197 + }, + { + "epoch": 2.488389740476552, + "grad_norm": 1.1697198152542114, + "learning_rate": 5.855016705477372e-05, + "loss": 0.7381, + "step": 8198 + }, + { + "epoch": 2.4886932766732435, + "grad_norm": 0.5753573775291443, + "learning_rate": 5.8545104788903516e-05, + "loss": 1.6113, + "step": 8199 + }, + { + "epoch": 2.4889968128699347, + "grad_norm": 0.6945953965187073, + "learning_rate": 5.854004252303331e-05, + "loss": 1.5555, + "step": 8200 + }, + { + "epoch": 2.4893003490666263, + "grad_norm": 0.7350156903266907, + "learning_rate": 5.8534980257163106e-05, + "loss": 1.5168, + "step": 8201 + }, + { + "epoch": 2.4896038852633176, + "grad_norm": 0.8408716917037964, + "learning_rate": 5.852991799129291e-05, + "loss": 0.6088, + "step": 8202 + }, + { + "epoch": 2.489907421460009, + "grad_norm": 0.4795108735561371, + "learning_rate": 5.85248557254227e-05, + "loss": 1.3906, + "step": 8203 + }, + { + "epoch": 2.4902109576567004, + "grad_norm": 0.6167114973068237, + "learning_rate": 5.85197934595525e-05, + "loss": 1.2835, + "step": 8204 + }, + { + "epoch": 2.490514493853392, + "grad_norm": 0.8909234404563904, + "learning_rate": 5.851473119368229e-05, + "loss": 1.0097, + "step": 8205 + }, + { + "epoch": 2.4908180300500833, + "grad_norm": 0.8499066233634949, + "learning_rate": 5.8509668927812086e-05, + "loss": 1.3819, + "step": 8206 + }, + { + "epoch": 2.491121566246775, + "grad_norm": 0.8384965062141418, + "learning_rate": 5.850460666194189e-05, + "loss": 1.4173, + "step": 8207 + }, + { + "epoch": 2.4914251024434666, + "grad_norm": 0.7784766554832458, + "learning_rate": 5.849954439607168e-05, + "loss": 1.1626, + "step": 8208 + }, + { + "epoch": 2.491728638640158, + "grad_norm": 0.9458047747612, + "learning_rate": 5.849448213020148e-05, + "loss": 0.8844, + "step": 8209 + }, + { + "epoch": 2.4920321748368495, + "grad_norm": 0.8930952548980713, + "learning_rate": 5.848941986433127e-05, + "loss": 1.1677, + "step": 8210 + }, + { + "epoch": 2.4923357110335407, + "grad_norm": 1.381670594215393, + "learning_rate": 5.848435759846107e-05, + "loss": 0.5978, + "step": 8211 + }, + { + "epoch": 2.4926392472302323, + "grad_norm": 0.8100841045379639, + "learning_rate": 5.847929533259087e-05, + "loss": 1.0791, + "step": 8212 + }, + { + "epoch": 2.4929427834269235, + "grad_norm": 1.0851428508758545, + "learning_rate": 5.8474233066720664e-05, + "loss": 1.2838, + "step": 8213 + }, + { + "epoch": 2.493246319623615, + "grad_norm": 0.6545624732971191, + "learning_rate": 5.846917080085046e-05, + "loss": 1.1591, + "step": 8214 + }, + { + "epoch": 2.4935498558203064, + "grad_norm": 0.6443068385124207, + "learning_rate": 5.846410853498025e-05, + "loss": 1.056, + "step": 8215 + }, + { + "epoch": 2.493853392016998, + "grad_norm": 0.8362240791320801, + "learning_rate": 5.8459046269110055e-05, + "loss": 1.011, + "step": 8216 + }, + { + "epoch": 2.4941569282136893, + "grad_norm": 0.7914416193962097, + "learning_rate": 5.845398400323985e-05, + "loss": 1.3633, + "step": 8217 + }, + { + "epoch": 2.494460464410381, + "grad_norm": 0.6533204317092896, + "learning_rate": 5.8448921737369644e-05, + "loss": 1.4868, + "step": 8218 + }, + { + "epoch": 2.4947640006070726, + "grad_norm": 0.8610125184059143, + "learning_rate": 5.844385947149945e-05, + "loss": 1.2977, + "step": 8219 + }, + { + "epoch": 2.495067536803764, + "grad_norm": 0.7882906198501587, + "learning_rate": 5.843879720562925e-05, + "loss": 1.2888, + "step": 8220 + }, + { + "epoch": 2.4953710730004555, + "grad_norm": 0.6866266131401062, + "learning_rate": 5.843373493975904e-05, + "loss": 1.372, + "step": 8221 + }, + { + "epoch": 2.4956746091971467, + "grad_norm": 0.7847753763198853, + "learning_rate": 5.842867267388884e-05, + "loss": 1.4631, + "step": 8222 + }, + { + "epoch": 2.4959781453938383, + "grad_norm": 0.7031978964805603, + "learning_rate": 5.842361040801864e-05, + "loss": 1.3046, + "step": 8223 + }, + { + "epoch": 2.4962816815905295, + "grad_norm": 0.8611195087432861, + "learning_rate": 5.841854814214843e-05, + "loss": 1.0336, + "step": 8224 + }, + { + "epoch": 2.496585217787221, + "grad_norm": 0.7341358065605164, + "learning_rate": 5.841348587627823e-05, + "loss": 1.4171, + "step": 8225 + }, + { + "epoch": 2.4968887539839124, + "grad_norm": 0.7016193866729736, + "learning_rate": 5.840842361040802e-05, + "loss": 1.0846, + "step": 8226 + }, + { + "epoch": 2.497192290180604, + "grad_norm": 0.6806894540786743, + "learning_rate": 5.8403361344537825e-05, + "loss": 1.5947, + "step": 8227 + }, + { + "epoch": 2.4974958263772953, + "grad_norm": 0.7414386868476868, + "learning_rate": 5.839829907866762e-05, + "loss": 1.4735, + "step": 8228 + }, + { + "epoch": 2.497799362573987, + "grad_norm": 0.7096249461174011, + "learning_rate": 5.8393236812797414e-05, + "loss": 1.6038, + "step": 8229 + }, + { + "epoch": 2.4981028987706786, + "grad_norm": 0.6129046678543091, + "learning_rate": 5.838817454692721e-05, + "loss": 1.366, + "step": 8230 + }, + { + "epoch": 2.49840643496737, + "grad_norm": 0.6488550305366516, + "learning_rate": 5.8383112281057004e-05, + "loss": 1.1149, + "step": 8231 + }, + { + "epoch": 2.4987099711640615, + "grad_norm": 0.7201668620109558, + "learning_rate": 5.8378050015186805e-05, + "loss": 1.4682, + "step": 8232 + }, + { + "epoch": 2.4990135073607527, + "grad_norm": 0.6897460222244263, + "learning_rate": 5.83729877493166e-05, + "loss": 1.5793, + "step": 8233 + }, + { + "epoch": 2.4993170435574443, + "grad_norm": 0.7777083516120911, + "learning_rate": 5.8367925483446395e-05, + "loss": 1.2701, + "step": 8234 + }, + { + "epoch": 2.4996205797541355, + "grad_norm": 0.7976280450820923, + "learning_rate": 5.836286321757619e-05, + "loss": 1.3406, + "step": 8235 + }, + { + "epoch": 2.499924115950827, + "grad_norm": 0.9655071496963501, + "learning_rate": 5.8357800951705984e-05, + "loss": 1.47, + "step": 8236 + }, + { + "epoch": 2.500227652147519, + "grad_norm": 1.0588605403900146, + "learning_rate": 5.8352738685835786e-05, + "loss": 1.6478, + "step": 8237 + }, + { + "epoch": 2.50053118834421, + "grad_norm": 0.8263594508171082, + "learning_rate": 5.834767641996558e-05, + "loss": 1.2701, + "step": 8238 + }, + { + "epoch": 2.5008347245409013, + "grad_norm": 0.8433352112770081, + "learning_rate": 5.8342614154095376e-05, + "loss": 1.1952, + "step": 8239 + }, + { + "epoch": 2.501138260737593, + "grad_norm": 0.8610383868217468, + "learning_rate": 5.833755188822517e-05, + "loss": 1.2973, + "step": 8240 + }, + { + "epoch": 2.5014417969342846, + "grad_norm": 0.8831159472465515, + "learning_rate": 5.833248962235497e-05, + "loss": 1.2945, + "step": 8241 + }, + { + "epoch": 2.501745333130976, + "grad_norm": 0.8238030672073364, + "learning_rate": 5.832742735648477e-05, + "loss": 1.0743, + "step": 8242 + }, + { + "epoch": 2.5020488693276675, + "grad_norm": 0.694869875907898, + "learning_rate": 5.832236509061456e-05, + "loss": 1.284, + "step": 8243 + }, + { + "epoch": 2.5023524055243587, + "grad_norm": 0.8804123997688293, + "learning_rate": 5.8317302824744356e-05, + "loss": 1.2777, + "step": 8244 + }, + { + "epoch": 2.5026559417210503, + "grad_norm": 0.849064290523529, + "learning_rate": 5.831224055887415e-05, + "loss": 1.3544, + "step": 8245 + }, + { + "epoch": 2.5029594779177415, + "grad_norm": 0.8048885464668274, + "learning_rate": 5.830717829300395e-05, + "loss": 1.4033, + "step": 8246 + }, + { + "epoch": 2.503263014114433, + "grad_norm": 1.1192904710769653, + "learning_rate": 5.830211602713375e-05, + "loss": 1.2505, + "step": 8247 + }, + { + "epoch": 2.503566550311125, + "grad_norm": 0.7913041114807129, + "learning_rate": 5.829705376126354e-05, + "loss": 1.8518, + "step": 8248 + }, + { + "epoch": 2.503870086507816, + "grad_norm": 0.8328375220298767, + "learning_rate": 5.829199149539334e-05, + "loss": 1.0964, + "step": 8249 + }, + { + "epoch": 2.5041736227045073, + "grad_norm": 0.7091400027275085, + "learning_rate": 5.828692922952313e-05, + "loss": 1.4837, + "step": 8250 + }, + { + "epoch": 2.504477158901199, + "grad_norm": 0.7294608354568481, + "learning_rate": 5.8281866963652933e-05, + "loss": 1.229, + "step": 8251 + }, + { + "epoch": 2.5047806950978906, + "grad_norm": 0.6839693784713745, + "learning_rate": 5.827680469778273e-05, + "loss": 0.6826, + "step": 8252 + }, + { + "epoch": 2.505084231294582, + "grad_norm": 0.7262798547744751, + "learning_rate": 5.827174243191252e-05, + "loss": 1.535, + "step": 8253 + }, + { + "epoch": 2.5053877674912735, + "grad_norm": 0.643129289150238, + "learning_rate": 5.826668016604232e-05, + "loss": 1.3863, + "step": 8254 + }, + { + "epoch": 2.5056913036879647, + "grad_norm": 0.6908575296401978, + "learning_rate": 5.826161790017212e-05, + "loss": 1.3708, + "step": 8255 + }, + { + "epoch": 2.5059948398846563, + "grad_norm": 0.7086175084114075, + "learning_rate": 5.8256555634301914e-05, + "loss": 1.5198, + "step": 8256 + }, + { + "epoch": 2.5062983760813475, + "grad_norm": 0.8000795245170593, + "learning_rate": 5.825149336843171e-05, + "loss": 1.2737, + "step": 8257 + }, + { + "epoch": 2.506601912278039, + "grad_norm": 0.7499237656593323, + "learning_rate": 5.8246431102561504e-05, + "loss": 1.803, + "step": 8258 + }, + { + "epoch": 2.506905448474731, + "grad_norm": 0.7838631868362427, + "learning_rate": 5.82413688366913e-05, + "loss": 1.3923, + "step": 8259 + }, + { + "epoch": 2.507208984671422, + "grad_norm": 0.8873981833457947, + "learning_rate": 5.82363065708211e-05, + "loss": 1.3988, + "step": 8260 + }, + { + "epoch": 2.5075125208681133, + "grad_norm": 0.6524555087089539, + "learning_rate": 5.8231244304950895e-05, + "loss": 1.5845, + "step": 8261 + }, + { + "epoch": 2.507816057064805, + "grad_norm": 0.6309491991996765, + "learning_rate": 5.822618203908069e-05, + "loss": 1.6408, + "step": 8262 + }, + { + "epoch": 2.5081195932614966, + "grad_norm": 0.745837926864624, + "learning_rate": 5.8221119773210485e-05, + "loss": 1.3983, + "step": 8263 + }, + { + "epoch": 2.508423129458188, + "grad_norm": 0.54892897605896, + "learning_rate": 5.821605750734029e-05, + "loss": 1.3872, + "step": 8264 + }, + { + "epoch": 2.5087266656548795, + "grad_norm": 0.8010538816452026, + "learning_rate": 5.821099524147009e-05, + "loss": 0.8688, + "step": 8265 + }, + { + "epoch": 2.5090302018515707, + "grad_norm": 0.7392093539237976, + "learning_rate": 5.820593297559989e-05, + "loss": 1.5812, + "step": 8266 + }, + { + "epoch": 2.5093337380482623, + "grad_norm": 0.8679747581481934, + "learning_rate": 5.8200870709729684e-05, + "loss": 0.9848, + "step": 8267 + }, + { + "epoch": 2.5096372742449535, + "grad_norm": 0.66532963514328, + "learning_rate": 5.819580844385948e-05, + "loss": 1.3337, + "step": 8268 + }, + { + "epoch": 2.509940810441645, + "grad_norm": 0.7652869820594788, + "learning_rate": 5.8190746177989274e-05, + "loss": 1.3081, + "step": 8269 + }, + { + "epoch": 2.510244346638337, + "grad_norm": 0.7931615114212036, + "learning_rate": 5.818568391211907e-05, + "loss": 1.3818, + "step": 8270 + }, + { + "epoch": 2.510547882835028, + "grad_norm": 0.7917905449867249, + "learning_rate": 5.818062164624887e-05, + "loss": 1.4077, + "step": 8271 + }, + { + "epoch": 2.5108514190317193, + "grad_norm": 1.0687652826309204, + "learning_rate": 5.8175559380378665e-05, + "loss": 1.1502, + "step": 8272 + }, + { + "epoch": 2.511154955228411, + "grad_norm": 0.5490748882293701, + "learning_rate": 5.817049711450846e-05, + "loss": 1.8143, + "step": 8273 + }, + { + "epoch": 2.5114584914251026, + "grad_norm": 0.549918532371521, + "learning_rate": 5.8165434848638254e-05, + "loss": 1.0906, + "step": 8274 + }, + { + "epoch": 2.511762027621794, + "grad_norm": 0.7719300389289856, + "learning_rate": 5.816037258276805e-05, + "loss": 1.5224, + "step": 8275 + }, + { + "epoch": 2.5120655638184854, + "grad_norm": 0.9084600210189819, + "learning_rate": 5.815531031689785e-05, + "loss": 1.5819, + "step": 8276 + }, + { + "epoch": 2.5123691000151767, + "grad_norm": 0.7480749487876892, + "learning_rate": 5.8150248051027646e-05, + "loss": 0.5482, + "step": 8277 + }, + { + "epoch": 2.5126726362118683, + "grad_norm": 0.9660145044326782, + "learning_rate": 5.814518578515744e-05, + "loss": 1.2091, + "step": 8278 + }, + { + "epoch": 2.5129761724085595, + "grad_norm": 0.7917237877845764, + "learning_rate": 5.8140123519287235e-05, + "loss": 1.2719, + "step": 8279 + }, + { + "epoch": 2.513279708605251, + "grad_norm": 0.6512066125869751, + "learning_rate": 5.813506125341704e-05, + "loss": 1.2307, + "step": 8280 + }, + { + "epoch": 2.513583244801943, + "grad_norm": 0.9117663502693176, + "learning_rate": 5.812999898754683e-05, + "loss": 1.0733, + "step": 8281 + }, + { + "epoch": 2.513886780998634, + "grad_norm": 0.7804828882217407, + "learning_rate": 5.8124936721676626e-05, + "loss": 1.2834, + "step": 8282 + }, + { + "epoch": 2.5141903171953257, + "grad_norm": 0.6926867365837097, + "learning_rate": 5.811987445580642e-05, + "loss": 1.0372, + "step": 8283 + }, + { + "epoch": 2.514493853392017, + "grad_norm": 0.7769339680671692, + "learning_rate": 5.8114812189936216e-05, + "loss": 1.3893, + "step": 8284 + }, + { + "epoch": 2.5147973895887086, + "grad_norm": 0.8659480810165405, + "learning_rate": 5.810974992406602e-05, + "loss": 1.3224, + "step": 8285 + }, + { + "epoch": 2.5151009257854, + "grad_norm": 0.8789883852005005, + "learning_rate": 5.810468765819581e-05, + "loss": 1.0516, + "step": 8286 + }, + { + "epoch": 2.5154044619820914, + "grad_norm": 0.7343612909317017, + "learning_rate": 5.809962539232561e-05, + "loss": 1.4813, + "step": 8287 + }, + { + "epoch": 2.5157079981787827, + "grad_norm": 0.7558401226997375, + "learning_rate": 5.80945631264554e-05, + "loss": 1.5583, + "step": 8288 + }, + { + "epoch": 2.5160115343754743, + "grad_norm": 0.5077150464057922, + "learning_rate": 5.80895008605852e-05, + "loss": 1.7516, + "step": 8289 + }, + { + "epoch": 2.5163150705721655, + "grad_norm": 0.8321554660797119, + "learning_rate": 5.8084438594715e-05, + "loss": 1.2849, + "step": 8290 + }, + { + "epoch": 2.516618606768857, + "grad_norm": 0.7427679896354675, + "learning_rate": 5.807937632884479e-05, + "loss": 1.2341, + "step": 8291 + }, + { + "epoch": 2.516922142965549, + "grad_norm": 0.7994052767753601, + "learning_rate": 5.807431406297459e-05, + "loss": 1.3661, + "step": 8292 + }, + { + "epoch": 2.51722567916224, + "grad_norm": 0.8299456834793091, + "learning_rate": 5.806925179710438e-05, + "loss": 1.0751, + "step": 8293 + }, + { + "epoch": 2.5175292153589317, + "grad_norm": 1.3051421642303467, + "learning_rate": 5.8064189531234184e-05, + "loss": 1.1627, + "step": 8294 + }, + { + "epoch": 2.517832751555623, + "grad_norm": 0.8921769857406616, + "learning_rate": 5.805912726536398e-05, + "loss": 1.3073, + "step": 8295 + }, + { + "epoch": 2.5181362877523146, + "grad_norm": 0.78340744972229, + "learning_rate": 5.8054064999493774e-05, + "loss": 1.7102, + "step": 8296 + }, + { + "epoch": 2.518439823949006, + "grad_norm": 0.6377023458480835, + "learning_rate": 5.804900273362357e-05, + "loss": 1.3017, + "step": 8297 + }, + { + "epoch": 2.5187433601456974, + "grad_norm": 0.8279851675033569, + "learning_rate": 5.804394046775336e-05, + "loss": 1.3353, + "step": 8298 + }, + { + "epoch": 2.519046896342389, + "grad_norm": 0.9127827286720276, + "learning_rate": 5.8038878201883165e-05, + "loss": 1.5184, + "step": 8299 + }, + { + "epoch": 2.5193504325390803, + "grad_norm": 0.6756750345230103, + "learning_rate": 5.803381593601296e-05, + "loss": 1.2604, + "step": 8300 + }, + { + "epoch": 2.5196539687357715, + "grad_norm": 0.6783917546272278, + "learning_rate": 5.8028753670142755e-05, + "loss": 0.9262, + "step": 8301 + }, + { + "epoch": 2.519957504932463, + "grad_norm": 0.7022363543510437, + "learning_rate": 5.802369140427255e-05, + "loss": 1.4863, + "step": 8302 + }, + { + "epoch": 2.520261041129155, + "grad_norm": 0.6683651804924011, + "learning_rate": 5.8018629138402344e-05, + "loss": 1.3591, + "step": 8303 + }, + { + "epoch": 2.520564577325846, + "grad_norm": 0.7177444100379944, + "learning_rate": 5.8013566872532146e-05, + "loss": 1.6204, + "step": 8304 + }, + { + "epoch": 2.5208681135225377, + "grad_norm": 0.7411631941795349, + "learning_rate": 5.800850460666194e-05, + "loss": 1.4532, + "step": 8305 + }, + { + "epoch": 2.521171649719229, + "grad_norm": 1.0475449562072754, + "learning_rate": 5.8003442340791735e-05, + "loss": 1.3259, + "step": 8306 + }, + { + "epoch": 2.5214751859159206, + "grad_norm": 0.7804305553436279, + "learning_rate": 5.799838007492153e-05, + "loss": 1.4362, + "step": 8307 + }, + { + "epoch": 2.521778722112612, + "grad_norm": 0.6839709877967834, + "learning_rate": 5.799331780905134e-05, + "loss": 1.288, + "step": 8308 + }, + { + "epoch": 2.5220822583093034, + "grad_norm": 1.0387964248657227, + "learning_rate": 5.798825554318113e-05, + "loss": 1.1246, + "step": 8309 + }, + { + "epoch": 2.522385794505995, + "grad_norm": 0.7365851998329163, + "learning_rate": 5.7983193277310935e-05, + "loss": 1.3858, + "step": 8310 + }, + { + "epoch": 2.5226893307026863, + "grad_norm": 0.9452034831047058, + "learning_rate": 5.797813101144073e-05, + "loss": 1.2232, + "step": 8311 + }, + { + "epoch": 2.5229928668993775, + "grad_norm": 0.8562889099121094, + "learning_rate": 5.7973068745570524e-05, + "loss": 1.1791, + "step": 8312 + }, + { + "epoch": 2.523296403096069, + "grad_norm": 0.7401477694511414, + "learning_rate": 5.796800647970032e-05, + "loss": 1.4497, + "step": 8313 + }, + { + "epoch": 2.523599939292761, + "grad_norm": 0.8923379778862, + "learning_rate": 5.7962944213830114e-05, + "loss": 1.4017, + "step": 8314 + }, + { + "epoch": 2.523903475489452, + "grad_norm": 0.8456719517707825, + "learning_rate": 5.7957881947959916e-05, + "loss": 0.9548, + "step": 8315 + }, + { + "epoch": 2.5242070116861437, + "grad_norm": 0.7915187478065491, + "learning_rate": 5.795281968208971e-05, + "loss": 1.3552, + "step": 8316 + }, + { + "epoch": 2.524510547882835, + "grad_norm": 0.8779439926147461, + "learning_rate": 5.7947757416219505e-05, + "loss": 1.4899, + "step": 8317 + }, + { + "epoch": 2.5248140840795266, + "grad_norm": 0.8715509176254272, + "learning_rate": 5.79426951503493e-05, + "loss": 1.3182, + "step": 8318 + }, + { + "epoch": 2.525117620276218, + "grad_norm": 0.7651680111885071, + "learning_rate": 5.79376328844791e-05, + "loss": 1.6216, + "step": 8319 + }, + { + "epoch": 2.5254211564729094, + "grad_norm": 0.7060753703117371, + "learning_rate": 5.7932570618608896e-05, + "loss": 1.4024, + "step": 8320 + }, + { + "epoch": 2.525724692669601, + "grad_norm": 0.9709405303001404, + "learning_rate": 5.792750835273869e-05, + "loss": 1.3222, + "step": 8321 + }, + { + "epoch": 2.5260282288662923, + "grad_norm": 0.6587694883346558, + "learning_rate": 5.7922446086868486e-05, + "loss": 1.5824, + "step": 8322 + }, + { + "epoch": 2.5263317650629835, + "grad_norm": 0.6169544458389282, + "learning_rate": 5.791738382099828e-05, + "loss": 1.1027, + "step": 8323 + }, + { + "epoch": 2.526635301259675, + "grad_norm": 0.7203264236450195, + "learning_rate": 5.791232155512808e-05, + "loss": 1.1206, + "step": 8324 + }, + { + "epoch": 2.526938837456367, + "grad_norm": 0.7035651206970215, + "learning_rate": 5.790725928925788e-05, + "loss": 1.2364, + "step": 8325 + }, + { + "epoch": 2.527242373653058, + "grad_norm": 0.7010215520858765, + "learning_rate": 5.790219702338767e-05, + "loss": 1.0687, + "step": 8326 + }, + { + "epoch": 2.5275459098497497, + "grad_norm": 0.8027076125144958, + "learning_rate": 5.789713475751747e-05, + "loss": 1.2478, + "step": 8327 + }, + { + "epoch": 2.527849446046441, + "grad_norm": 0.776627779006958, + "learning_rate": 5.789207249164726e-05, + "loss": 1.4251, + "step": 8328 + }, + { + "epoch": 2.5281529822431326, + "grad_norm": 0.7934873104095459, + "learning_rate": 5.788701022577706e-05, + "loss": 1.5188, + "step": 8329 + }, + { + "epoch": 2.5284565184398238, + "grad_norm": 0.6677809953689575, + "learning_rate": 5.788194795990686e-05, + "loss": 1.2741, + "step": 8330 + }, + { + "epoch": 2.5287600546365154, + "grad_norm": 0.8031719923019409, + "learning_rate": 5.787688569403665e-05, + "loss": 1.3056, + "step": 8331 + }, + { + "epoch": 2.529063590833207, + "grad_norm": 0.616132915019989, + "learning_rate": 5.787182342816645e-05, + "loss": 1.4796, + "step": 8332 + }, + { + "epoch": 2.5293671270298983, + "grad_norm": 0.7809193134307861, + "learning_rate": 5.786676116229625e-05, + "loss": 1.2991, + "step": 8333 + }, + { + "epoch": 2.5296706632265895, + "grad_norm": 0.8768143653869629, + "learning_rate": 5.7861698896426044e-05, + "loss": 1.3272, + "step": 8334 + }, + { + "epoch": 2.529974199423281, + "grad_norm": 0.9282557368278503, + "learning_rate": 5.785663663055584e-05, + "loss": 1.372, + "step": 8335 + }, + { + "epoch": 2.530277735619973, + "grad_norm": 0.7703008055686951, + "learning_rate": 5.785157436468563e-05, + "loss": 1.9085, + "step": 8336 + }, + { + "epoch": 2.530581271816664, + "grad_norm": 0.8404890298843384, + "learning_rate": 5.784651209881543e-05, + "loss": 1.0276, + "step": 8337 + }, + { + "epoch": 2.5308848080133557, + "grad_norm": 0.7915540933609009, + "learning_rate": 5.784144983294523e-05, + "loss": 1.5161, + "step": 8338 + }, + { + "epoch": 2.531188344210047, + "grad_norm": 0.9564916491508484, + "learning_rate": 5.7836387567075024e-05, + "loss": 1.1961, + "step": 8339 + }, + { + "epoch": 2.5314918804067386, + "grad_norm": 0.8775029182434082, + "learning_rate": 5.783132530120482e-05, + "loss": 1.4621, + "step": 8340 + }, + { + "epoch": 2.5317954166034298, + "grad_norm": 0.8440585136413574, + "learning_rate": 5.7826263035334614e-05, + "loss": 0.8856, + "step": 8341 + }, + { + "epoch": 2.5320989528001214, + "grad_norm": 0.8222201466560364, + "learning_rate": 5.782120076946441e-05, + "loss": 1.0385, + "step": 8342 + }, + { + "epoch": 2.532402488996813, + "grad_norm": 0.7944109439849854, + "learning_rate": 5.781613850359421e-05, + "loss": 1.2259, + "step": 8343 + }, + { + "epoch": 2.5327060251935043, + "grad_norm": 0.6671246886253357, + "learning_rate": 5.7811076237724005e-05, + "loss": 1.1106, + "step": 8344 + }, + { + "epoch": 2.5330095613901955, + "grad_norm": 0.7811467051506042, + "learning_rate": 5.78060139718538e-05, + "loss": 1.2432, + "step": 8345 + }, + { + "epoch": 2.533313097586887, + "grad_norm": 0.8934546113014221, + "learning_rate": 5.7800951705983595e-05, + "loss": 1.2128, + "step": 8346 + }, + { + "epoch": 2.533616633783579, + "grad_norm": 0.7736660838127136, + "learning_rate": 5.7795889440113396e-05, + "loss": 1.5971, + "step": 8347 + }, + { + "epoch": 2.53392016998027, + "grad_norm": 0.8068675398826599, + "learning_rate": 5.779082717424319e-05, + "loss": 0.9002, + "step": 8348 + }, + { + "epoch": 2.5342237061769617, + "grad_norm": 0.7460306882858276, + "learning_rate": 5.7785764908372986e-05, + "loss": 1.0504, + "step": 8349 + }, + { + "epoch": 2.534527242373653, + "grad_norm": 0.8296782970428467, + "learning_rate": 5.778070264250278e-05, + "loss": 1.4817, + "step": 8350 + }, + { + "epoch": 2.5348307785703446, + "grad_norm": 1.005679965019226, + "learning_rate": 5.7775640376632576e-05, + "loss": 0.9825, + "step": 8351 + }, + { + "epoch": 2.5351343147670358, + "grad_norm": 0.7980281710624695, + "learning_rate": 5.777057811076238e-05, + "loss": 1.3191, + "step": 8352 + }, + { + "epoch": 2.5354378509637274, + "grad_norm": 0.7153705954551697, + "learning_rate": 5.776551584489218e-05, + "loss": 1.2975, + "step": 8353 + }, + { + "epoch": 2.535741387160419, + "grad_norm": 0.8253535628318787, + "learning_rate": 5.776045357902198e-05, + "loss": 1.5653, + "step": 8354 + }, + { + "epoch": 2.5360449233571103, + "grad_norm": 0.7665138244628906, + "learning_rate": 5.7755391313151775e-05, + "loss": 1.4909, + "step": 8355 + }, + { + "epoch": 2.536348459553802, + "grad_norm": 0.9321874380111694, + "learning_rate": 5.775032904728157e-05, + "loss": 0.7591, + "step": 8356 + }, + { + "epoch": 2.536651995750493, + "grad_norm": 0.726944625377655, + "learning_rate": 5.7745266781411365e-05, + "loss": 1.3542, + "step": 8357 + }, + { + "epoch": 2.536955531947185, + "grad_norm": 0.8286332488059998, + "learning_rate": 5.7740204515541166e-05, + "loss": 1.7159, + "step": 8358 + }, + { + "epoch": 2.537259068143876, + "grad_norm": 0.8257943987846375, + "learning_rate": 5.773514224967096e-05, + "loss": 0.7839, + "step": 8359 + }, + { + "epoch": 2.5375626043405677, + "grad_norm": 0.7034426331520081, + "learning_rate": 5.7730079983800756e-05, + "loss": 1.2258, + "step": 8360 + }, + { + "epoch": 2.5378661405372593, + "grad_norm": 0.70139080286026, + "learning_rate": 5.772501771793055e-05, + "loss": 1.3429, + "step": 8361 + }, + { + "epoch": 2.5381696767339506, + "grad_norm": 0.8869861364364624, + "learning_rate": 5.7719955452060345e-05, + "loss": 1.3633, + "step": 8362 + }, + { + "epoch": 2.5384732129306418, + "grad_norm": 0.7827722430229187, + "learning_rate": 5.771489318619015e-05, + "loss": 1.4197, + "step": 8363 + }, + { + "epoch": 2.5387767491273334, + "grad_norm": 0.8089902997016907, + "learning_rate": 5.770983092031994e-05, + "loss": 1.4385, + "step": 8364 + }, + { + "epoch": 2.539080285324025, + "grad_norm": 0.7806230187416077, + "learning_rate": 5.7704768654449737e-05, + "loss": 1.4791, + "step": 8365 + }, + { + "epoch": 2.5393838215207163, + "grad_norm": 1.0129203796386719, + "learning_rate": 5.769970638857953e-05, + "loss": 1.2102, + "step": 8366 + }, + { + "epoch": 2.539687357717408, + "grad_norm": 0.7767428755760193, + "learning_rate": 5.7694644122709326e-05, + "loss": 1.4489, + "step": 8367 + }, + { + "epoch": 2.539990893914099, + "grad_norm": 0.6615005135536194, + "learning_rate": 5.768958185683913e-05, + "loss": 1.2321, + "step": 8368 + }, + { + "epoch": 2.540294430110791, + "grad_norm": 0.9702263474464417, + "learning_rate": 5.768451959096892e-05, + "loss": 1.4452, + "step": 8369 + }, + { + "epoch": 2.540597966307482, + "grad_norm": 1.022396206855774, + "learning_rate": 5.767945732509872e-05, + "loss": 0.7356, + "step": 8370 + }, + { + "epoch": 2.5409015025041737, + "grad_norm": 0.7501503229141235, + "learning_rate": 5.767439505922851e-05, + "loss": 1.1511, + "step": 8371 + }, + { + "epoch": 2.5412050387008653, + "grad_norm": 0.8175548315048218, + "learning_rate": 5.7669332793358314e-05, + "loss": 1.37, + "step": 8372 + }, + { + "epoch": 2.5415085748975565, + "grad_norm": 0.7536338567733765, + "learning_rate": 5.766427052748811e-05, + "loss": 1.2864, + "step": 8373 + }, + { + "epoch": 2.5418121110942478, + "grad_norm": 0.8838115334510803, + "learning_rate": 5.76592082616179e-05, + "loss": 1.348, + "step": 8374 + }, + { + "epoch": 2.5421156472909394, + "grad_norm": 0.7550739049911499, + "learning_rate": 5.76541459957477e-05, + "loss": 1.4292, + "step": 8375 + }, + { + "epoch": 2.542419183487631, + "grad_norm": 0.8385372757911682, + "learning_rate": 5.764908372987749e-05, + "loss": 1.6506, + "step": 8376 + }, + { + "epoch": 2.5427227196843223, + "grad_norm": 0.6910164952278137, + "learning_rate": 5.7644021464007294e-05, + "loss": 0.8724, + "step": 8377 + }, + { + "epoch": 2.543026255881014, + "grad_norm": 0.7734633088111877, + "learning_rate": 5.763895919813709e-05, + "loss": 0.2027, + "step": 8378 + }, + { + "epoch": 2.543329792077705, + "grad_norm": 0.6335809826850891, + "learning_rate": 5.7633896932266884e-05, + "loss": 0.9734, + "step": 8379 + }, + { + "epoch": 2.543633328274397, + "grad_norm": 0.7772993445396423, + "learning_rate": 5.762883466639668e-05, + "loss": 1.3989, + "step": 8380 + }, + { + "epoch": 2.543936864471088, + "grad_norm": 0.8541886806488037, + "learning_rate": 5.7623772400526474e-05, + "loss": 1.4976, + "step": 8381 + }, + { + "epoch": 2.5442404006677797, + "grad_norm": 0.8611499071121216, + "learning_rate": 5.7618710134656275e-05, + "loss": 1.2922, + "step": 8382 + }, + { + "epoch": 2.5445439368644713, + "grad_norm": 0.8912010192871094, + "learning_rate": 5.761364786878607e-05, + "loss": 1.3701, + "step": 8383 + }, + { + "epoch": 2.5448474730611625, + "grad_norm": 0.6431745886802673, + "learning_rate": 5.7608585602915865e-05, + "loss": 1.5163, + "step": 8384 + }, + { + "epoch": 2.5451510092578538, + "grad_norm": 0.7857423424720764, + "learning_rate": 5.760352333704566e-05, + "loss": 1.5629, + "step": 8385 + }, + { + "epoch": 2.5454545454545454, + "grad_norm": 0.8442662358283997, + "learning_rate": 5.759846107117546e-05, + "loss": 0.6841, + "step": 8386 + }, + { + "epoch": 2.545758081651237, + "grad_norm": 0.7257554531097412, + "learning_rate": 5.7593398805305256e-05, + "loss": 1.1639, + "step": 8387 + }, + { + "epoch": 2.5460616178479283, + "grad_norm": 0.693900465965271, + "learning_rate": 5.758833653943505e-05, + "loss": 0.9785, + "step": 8388 + }, + { + "epoch": 2.54636515404462, + "grad_norm": 0.45039594173431396, + "learning_rate": 5.7583274273564846e-05, + "loss": 0.7857, + "step": 8389 + }, + { + "epoch": 2.546668690241311, + "grad_norm": 0.8388344049453735, + "learning_rate": 5.757821200769464e-05, + "loss": 1.2768, + "step": 8390 + }, + { + "epoch": 2.546972226438003, + "grad_norm": 0.7791977524757385, + "learning_rate": 5.757314974182444e-05, + "loss": 1.3924, + "step": 8391 + }, + { + "epoch": 2.547275762634694, + "grad_norm": 0.760310173034668, + "learning_rate": 5.756808747595424e-05, + "loss": 1.6505, + "step": 8392 + }, + { + "epoch": 2.5475792988313857, + "grad_norm": 0.7585695385932922, + "learning_rate": 5.756302521008403e-05, + "loss": 1.4748, + "step": 8393 + }, + { + "epoch": 2.5478828350280773, + "grad_norm": 0.8367417454719543, + "learning_rate": 5.7557962944213826e-05, + "loss": 1.3173, + "step": 8394 + }, + { + "epoch": 2.5481863712247685, + "grad_norm": 0.81987065076828, + "learning_rate": 5.755290067834362e-05, + "loss": 1.1409, + "step": 8395 + }, + { + "epoch": 2.5484899074214598, + "grad_norm": 0.6271147131919861, + "learning_rate": 5.754783841247342e-05, + "loss": 1.169, + "step": 8396 + }, + { + "epoch": 2.5487934436181514, + "grad_norm": 0.7545047402381897, + "learning_rate": 5.754277614660323e-05, + "loss": 1.3987, + "step": 8397 + }, + { + "epoch": 2.549096979814843, + "grad_norm": 0.791571319103241, + "learning_rate": 5.7537713880733026e-05, + "loss": 1.5415, + "step": 8398 + }, + { + "epoch": 2.5494005160115343, + "grad_norm": 0.6578336358070374, + "learning_rate": 5.753265161486282e-05, + "loss": 1.4396, + "step": 8399 + }, + { + "epoch": 2.549704052208226, + "grad_norm": 0.70784592628479, + "learning_rate": 5.7527589348992615e-05, + "loss": 1.3632, + "step": 8400 + }, + { + "epoch": 2.550007588404917, + "grad_norm": 0.8875760436058044, + "learning_rate": 5.752252708312241e-05, + "loss": 1.4161, + "step": 8401 + }, + { + "epoch": 2.550311124601609, + "grad_norm": 0.6619694828987122, + "learning_rate": 5.751746481725221e-05, + "loss": 1.5841, + "step": 8402 + }, + { + "epoch": 2.5506146607983, + "grad_norm": 0.6046818494796753, + "learning_rate": 5.7512402551382007e-05, + "loss": 0.9539, + "step": 8403 + }, + { + "epoch": 2.5509181969949917, + "grad_norm": 0.7589368224143982, + "learning_rate": 5.75073402855118e-05, + "loss": 1.5581, + "step": 8404 + }, + { + "epoch": 2.5512217331916833, + "grad_norm": 0.9756147265434265, + "learning_rate": 5.7502278019641596e-05, + "loss": 1.1617, + "step": 8405 + }, + { + "epoch": 2.5515252693883745, + "grad_norm": 1.0096057653427124, + "learning_rate": 5.749721575377139e-05, + "loss": 1.345, + "step": 8406 + }, + { + "epoch": 2.5518288055850658, + "grad_norm": 0.7118128538131714, + "learning_rate": 5.749215348790119e-05, + "loss": 1.2063, + "step": 8407 + }, + { + "epoch": 2.5521323417817574, + "grad_norm": 0.668727695941925, + "learning_rate": 5.748709122203099e-05, + "loss": 0.9131, + "step": 8408 + }, + { + "epoch": 2.552435877978449, + "grad_norm": 0.6993553042411804, + "learning_rate": 5.748202895616078e-05, + "loss": 1.0686, + "step": 8409 + }, + { + "epoch": 2.5527394141751403, + "grad_norm": 0.6418778300285339, + "learning_rate": 5.747696669029058e-05, + "loss": 1.2229, + "step": 8410 + }, + { + "epoch": 2.553042950371832, + "grad_norm": 0.8472557067871094, + "learning_rate": 5.747190442442038e-05, + "loss": 1.0367, + "step": 8411 + }, + { + "epoch": 2.553346486568523, + "grad_norm": 0.8465378880500793, + "learning_rate": 5.746684215855017e-05, + "loss": 1.3367, + "step": 8412 + }, + { + "epoch": 2.553650022765215, + "grad_norm": 0.9106330275535583, + "learning_rate": 5.746177989267997e-05, + "loss": 1.4787, + "step": 8413 + }, + { + "epoch": 2.553953558961906, + "grad_norm": 0.7567964196205139, + "learning_rate": 5.745671762680976e-05, + "loss": 1.2565, + "step": 8414 + }, + { + "epoch": 2.5542570951585977, + "grad_norm": 0.8421973586082458, + "learning_rate": 5.745165536093956e-05, + "loss": 1.6682, + "step": 8415 + }, + { + "epoch": 2.5545606313552893, + "grad_norm": 0.5599844455718994, + "learning_rate": 5.744659309506936e-05, + "loss": 1.1791, + "step": 8416 + }, + { + "epoch": 2.5548641675519805, + "grad_norm": 0.6801798343658447, + "learning_rate": 5.7441530829199154e-05, + "loss": 1.5051, + "step": 8417 + }, + { + "epoch": 2.555167703748672, + "grad_norm": 0.7134249806404114, + "learning_rate": 5.743646856332895e-05, + "loss": 1.5552, + "step": 8418 + }, + { + "epoch": 2.5554712399453634, + "grad_norm": 0.9280962944030762, + "learning_rate": 5.7431406297458744e-05, + "loss": 1.2458, + "step": 8419 + }, + { + "epoch": 2.555774776142055, + "grad_norm": 0.7428077459335327, + "learning_rate": 5.742634403158854e-05, + "loss": 1.5828, + "step": 8420 + }, + { + "epoch": 2.5560783123387463, + "grad_norm": 0.7723643183708191, + "learning_rate": 5.742128176571834e-05, + "loss": 1.5052, + "step": 8421 + }, + { + "epoch": 2.556381848535438, + "grad_norm": 0.7478247880935669, + "learning_rate": 5.7416219499848135e-05, + "loss": 1.3942, + "step": 8422 + }, + { + "epoch": 2.556685384732129, + "grad_norm": 0.7729806303977966, + "learning_rate": 5.741115723397793e-05, + "loss": 1.5192, + "step": 8423 + }, + { + "epoch": 2.556988920928821, + "grad_norm": 0.7728172540664673, + "learning_rate": 5.7406094968107724e-05, + "loss": 1.4517, + "step": 8424 + }, + { + "epoch": 2.557292457125512, + "grad_norm": 0.6714379787445068, + "learning_rate": 5.7401032702237526e-05, + "loss": 1.4359, + "step": 8425 + }, + { + "epoch": 2.5575959933222037, + "grad_norm": 0.8857477307319641, + "learning_rate": 5.739597043636732e-05, + "loss": 1.0287, + "step": 8426 + }, + { + "epoch": 2.5578995295188953, + "grad_norm": 0.7782611846923828, + "learning_rate": 5.7390908170497115e-05, + "loss": 1.263, + "step": 8427 + }, + { + "epoch": 2.5582030657155865, + "grad_norm": 0.5905637741088867, + "learning_rate": 5.738584590462691e-05, + "loss": 0.7798, + "step": 8428 + }, + { + "epoch": 2.558506601912278, + "grad_norm": 0.5645215511322021, + "learning_rate": 5.7380783638756705e-05, + "loss": 1.5072, + "step": 8429 + }, + { + "epoch": 2.5588101381089694, + "grad_norm": 0.7701306343078613, + "learning_rate": 5.737572137288651e-05, + "loss": 1.5126, + "step": 8430 + }, + { + "epoch": 2.559113674305661, + "grad_norm": 0.6775439977645874, + "learning_rate": 5.73706591070163e-05, + "loss": 1.0372, + "step": 8431 + }, + { + "epoch": 2.5594172105023523, + "grad_norm": 0.5594154596328735, + "learning_rate": 5.7365596841146096e-05, + "loss": 1.779, + "step": 8432 + }, + { + "epoch": 2.559720746699044, + "grad_norm": 0.6067804098129272, + "learning_rate": 5.736053457527589e-05, + "loss": 1.6787, + "step": 8433 + }, + { + "epoch": 2.5600242828957356, + "grad_norm": 0.7488258481025696, + "learning_rate": 5.7355472309405686e-05, + "loss": 1.0234, + "step": 8434 + }, + { + "epoch": 2.560327819092427, + "grad_norm": 0.8093794584274292, + "learning_rate": 5.735041004353549e-05, + "loss": 1.4619, + "step": 8435 + }, + { + "epoch": 2.560631355289118, + "grad_norm": 0.8266646265983582, + "learning_rate": 5.734534777766528e-05, + "loss": 1.1253, + "step": 8436 + }, + { + "epoch": 2.5609348914858097, + "grad_norm": 0.6163046360015869, + "learning_rate": 5.734028551179508e-05, + "loss": 1.0671, + "step": 8437 + }, + { + "epoch": 2.5612384276825013, + "grad_norm": 1.0120611190795898, + "learning_rate": 5.733522324592487e-05, + "loss": 1.1202, + "step": 8438 + }, + { + "epoch": 2.5615419638791925, + "grad_norm": 0.875623345375061, + "learning_rate": 5.733016098005467e-05, + "loss": 1.3208, + "step": 8439 + }, + { + "epoch": 2.561845500075884, + "grad_norm": 0.673265814781189, + "learning_rate": 5.732509871418447e-05, + "loss": 1.5828, + "step": 8440 + }, + { + "epoch": 2.5621490362725754, + "grad_norm": 0.8331916332244873, + "learning_rate": 5.732003644831426e-05, + "loss": 1.458, + "step": 8441 + }, + { + "epoch": 2.562452572469267, + "grad_norm": 0.847938060760498, + "learning_rate": 5.731497418244407e-05, + "loss": 0.7955, + "step": 8442 + }, + { + "epoch": 2.5627561086659583, + "grad_norm": 0.8607446551322937, + "learning_rate": 5.7309911916573866e-05, + "loss": 1.2457, + "step": 8443 + }, + { + "epoch": 2.56305964486265, + "grad_norm": 0.8080736994743347, + "learning_rate": 5.730484965070366e-05, + "loss": 1.347, + "step": 8444 + }, + { + "epoch": 2.5633631810593416, + "grad_norm": 1.0638771057128906, + "learning_rate": 5.7299787384833456e-05, + "loss": 0.7558, + "step": 8445 + }, + { + "epoch": 2.563666717256033, + "grad_norm": 0.8940908908843994, + "learning_rate": 5.729472511896326e-05, + "loss": 1.2289, + "step": 8446 + }, + { + "epoch": 2.563970253452724, + "grad_norm": 0.7230290174484253, + "learning_rate": 5.728966285309305e-05, + "loss": 0.827, + "step": 8447 + }, + { + "epoch": 2.5642737896494157, + "grad_norm": 0.9477673768997192, + "learning_rate": 5.728460058722285e-05, + "loss": 1.1062, + "step": 8448 + }, + { + "epoch": 2.5645773258461073, + "grad_norm": 0.858997642993927, + "learning_rate": 5.727953832135264e-05, + "loss": 1.2228, + "step": 8449 + }, + { + "epoch": 2.5648808620427985, + "grad_norm": 0.804364025592804, + "learning_rate": 5.727447605548244e-05, + "loss": 1.0802, + "step": 8450 + }, + { + "epoch": 2.56518439823949, + "grad_norm": 0.7806909680366516, + "learning_rate": 5.726941378961224e-05, + "loss": 0.9608, + "step": 8451 + }, + { + "epoch": 2.5654879344361814, + "grad_norm": 0.8432273268699646, + "learning_rate": 5.726435152374203e-05, + "loss": 1.1938, + "step": 8452 + }, + { + "epoch": 2.565791470632873, + "grad_norm": 0.7382596731185913, + "learning_rate": 5.725928925787183e-05, + "loss": 1.6108, + "step": 8453 + }, + { + "epoch": 2.5660950068295643, + "grad_norm": 0.7803032398223877, + "learning_rate": 5.725422699200162e-05, + "loss": 1.0205, + "step": 8454 + }, + { + "epoch": 2.566398543026256, + "grad_norm": 0.8174391984939575, + "learning_rate": 5.7249164726131424e-05, + "loss": 1.4251, + "step": 8455 + }, + { + "epoch": 2.5667020792229476, + "grad_norm": 0.6124516725540161, + "learning_rate": 5.724410246026122e-05, + "loss": 0.953, + "step": 8456 + }, + { + "epoch": 2.567005615419639, + "grad_norm": 0.694999098777771, + "learning_rate": 5.7239040194391014e-05, + "loss": 1.6602, + "step": 8457 + }, + { + "epoch": 2.56730915161633, + "grad_norm": 0.8906886577606201, + "learning_rate": 5.723397792852081e-05, + "loss": 1.3543, + "step": 8458 + }, + { + "epoch": 2.5676126878130217, + "grad_norm": 0.6976550817489624, + "learning_rate": 5.72289156626506e-05, + "loss": 1.7041, + "step": 8459 + }, + { + "epoch": 2.5679162240097133, + "grad_norm": 0.7767781019210815, + "learning_rate": 5.7223853396780405e-05, + "loss": 1.1142, + "step": 8460 + }, + { + "epoch": 2.5682197602064045, + "grad_norm": 0.6639768481254578, + "learning_rate": 5.72187911309102e-05, + "loss": 1.109, + "step": 8461 + }, + { + "epoch": 2.568523296403096, + "grad_norm": 0.6556784510612488, + "learning_rate": 5.7213728865039994e-05, + "loss": 0.714, + "step": 8462 + }, + { + "epoch": 2.5688268325997874, + "grad_norm": 0.7635631561279297, + "learning_rate": 5.720866659916979e-05, + "loss": 1.3227, + "step": 8463 + }, + { + "epoch": 2.569130368796479, + "grad_norm": 0.7205228209495544, + "learning_rate": 5.720360433329959e-05, + "loss": 1.117, + "step": 8464 + }, + { + "epoch": 2.5694339049931703, + "grad_norm": 0.8992099165916443, + "learning_rate": 5.7198542067429385e-05, + "loss": 1.2895, + "step": 8465 + }, + { + "epoch": 2.569737441189862, + "grad_norm": 0.7975298166275024, + "learning_rate": 5.719347980155918e-05, + "loss": 1.495, + "step": 8466 + }, + { + "epoch": 2.5700409773865536, + "grad_norm": 0.7453858256340027, + "learning_rate": 5.7188417535688975e-05, + "loss": 1.3353, + "step": 8467 + }, + { + "epoch": 2.570344513583245, + "grad_norm": 0.6829327344894409, + "learning_rate": 5.718335526981877e-05, + "loss": 1.6156, + "step": 8468 + }, + { + "epoch": 2.570648049779936, + "grad_norm": 0.7259297966957092, + "learning_rate": 5.717829300394857e-05, + "loss": 1.0498, + "step": 8469 + }, + { + "epoch": 2.5709515859766277, + "grad_norm": 0.805950403213501, + "learning_rate": 5.7173230738078366e-05, + "loss": 0.884, + "step": 8470 + }, + { + "epoch": 2.5712551221733193, + "grad_norm": 0.9011547565460205, + "learning_rate": 5.716816847220816e-05, + "loss": 1.3437, + "step": 8471 + }, + { + "epoch": 2.5715586583700105, + "grad_norm": 0.9217641949653625, + "learning_rate": 5.7163106206337956e-05, + "loss": 1.2101, + "step": 8472 + }, + { + "epoch": 2.571862194566702, + "grad_norm": 0.47957077622413635, + "learning_rate": 5.715804394046775e-05, + "loss": 0.6725, + "step": 8473 + }, + { + "epoch": 2.5721657307633934, + "grad_norm": 0.8739991784095764, + "learning_rate": 5.715298167459755e-05, + "loss": 1.1752, + "step": 8474 + }, + { + "epoch": 2.572469266960085, + "grad_norm": 0.7338137030601501, + "learning_rate": 5.714791940872735e-05, + "loss": 1.0399, + "step": 8475 + }, + { + "epoch": 2.5727728031567763, + "grad_norm": 0.8827327489852905, + "learning_rate": 5.714285714285714e-05, + "loss": 1.3659, + "step": 8476 + }, + { + "epoch": 2.573076339353468, + "grad_norm": 0.8811532258987427, + "learning_rate": 5.7137794876986937e-05, + "loss": 1.1658, + "step": 8477 + }, + { + "epoch": 2.5733798755501596, + "grad_norm": 0.6947214007377625, + "learning_rate": 5.713273261111674e-05, + "loss": 1.4052, + "step": 8478 + }, + { + "epoch": 2.573683411746851, + "grad_norm": 0.9203129410743713, + "learning_rate": 5.712767034524653e-05, + "loss": 1.3322, + "step": 8479 + }, + { + "epoch": 2.573986947943542, + "grad_norm": 0.9946419596672058, + "learning_rate": 5.712260807937633e-05, + "loss": 1.6423, + "step": 8480 + }, + { + "epoch": 2.5742904841402336, + "grad_norm": 0.8030951023101807, + "learning_rate": 5.711754581350612e-05, + "loss": 1.0093, + "step": 8481 + }, + { + "epoch": 2.5745940203369253, + "grad_norm": 0.9551969766616821, + "learning_rate": 5.711248354763592e-05, + "loss": 1.2978, + "step": 8482 + }, + { + "epoch": 2.5748975565336165, + "grad_norm": 1.039363980293274, + "learning_rate": 5.710742128176572e-05, + "loss": 1.4354, + "step": 8483 + }, + { + "epoch": 2.575201092730308, + "grad_norm": 0.6756881475448608, + "learning_rate": 5.7102359015895514e-05, + "loss": 1.1468, + "step": 8484 + }, + { + "epoch": 2.5755046289269994, + "grad_norm": 0.8306137919425964, + "learning_rate": 5.709729675002531e-05, + "loss": 1.5781, + "step": 8485 + }, + { + "epoch": 2.575808165123691, + "grad_norm": 0.8153355121612549, + "learning_rate": 5.709223448415512e-05, + "loss": 1.1197, + "step": 8486 + }, + { + "epoch": 2.5761117013203823, + "grad_norm": 0.8328362703323364, + "learning_rate": 5.708717221828491e-05, + "loss": 1.305, + "step": 8487 + }, + { + "epoch": 2.576415237517074, + "grad_norm": 0.7848013043403625, + "learning_rate": 5.7082109952414706e-05, + "loss": 1.3205, + "step": 8488 + }, + { + "epoch": 2.5767187737137656, + "grad_norm": 0.7196735739707947, + "learning_rate": 5.707704768654451e-05, + "loss": 1.3499, + "step": 8489 + }, + { + "epoch": 2.5770223099104568, + "grad_norm": 0.8000575304031372, + "learning_rate": 5.70719854206743e-05, + "loss": 1.1835, + "step": 8490 + }, + { + "epoch": 2.5773258461071484, + "grad_norm": 0.8718536496162415, + "learning_rate": 5.70669231548041e-05, + "loss": 1.2739, + "step": 8491 + }, + { + "epoch": 2.5776293823038396, + "grad_norm": 0.7113871574401855, + "learning_rate": 5.706186088893389e-05, + "loss": 1.5288, + "step": 8492 + }, + { + "epoch": 2.5779329185005313, + "grad_norm": 0.8177202343940735, + "learning_rate": 5.705679862306369e-05, + "loss": 1.2114, + "step": 8493 + }, + { + "epoch": 2.5782364546972225, + "grad_norm": 0.685756266117096, + "learning_rate": 5.705173635719349e-05, + "loss": 1.6081, + "step": 8494 + }, + { + "epoch": 2.578539990893914, + "grad_norm": 0.7527011036872864, + "learning_rate": 5.7046674091323283e-05, + "loss": 1.2229, + "step": 8495 + }, + { + "epoch": 2.578843527090606, + "grad_norm": 0.8383612036705017, + "learning_rate": 5.704161182545308e-05, + "loss": 1.3744, + "step": 8496 + }, + { + "epoch": 2.579147063287297, + "grad_norm": 0.7893360257148743, + "learning_rate": 5.703654955958287e-05, + "loss": 1.0301, + "step": 8497 + }, + { + "epoch": 2.5794505994839882, + "grad_norm": 0.766526997089386, + "learning_rate": 5.703148729371267e-05, + "loss": 1.1499, + "step": 8498 + }, + { + "epoch": 2.57975413568068, + "grad_norm": 1.030303955078125, + "learning_rate": 5.702642502784247e-05, + "loss": 1.3511, + "step": 8499 + }, + { + "epoch": 2.5800576718773716, + "grad_norm": 0.8071532845497131, + "learning_rate": 5.7021362761972264e-05, + "loss": 1.2655, + "step": 8500 + }, + { + "epoch": 2.5803612080740628, + "grad_norm": 0.8044392466545105, + "learning_rate": 5.701630049610206e-05, + "loss": 1.3552, + "step": 8501 + }, + { + "epoch": 2.5806647442707544, + "grad_norm": 0.881322979927063, + "learning_rate": 5.7011238230231854e-05, + "loss": 1.4468, + "step": 8502 + }, + { + "epoch": 2.5809682804674456, + "grad_norm": 0.8353243470191956, + "learning_rate": 5.7006175964361655e-05, + "loss": 1.2619, + "step": 8503 + }, + { + "epoch": 2.5812718166641373, + "grad_norm": 0.775505542755127, + "learning_rate": 5.700111369849145e-05, + "loss": 1.3108, + "step": 8504 + }, + { + "epoch": 2.5815753528608285, + "grad_norm": 0.7685757875442505, + "learning_rate": 5.6996051432621245e-05, + "loss": 1.5786, + "step": 8505 + }, + { + "epoch": 2.58187888905752, + "grad_norm": 0.7894332408905029, + "learning_rate": 5.699098916675104e-05, + "loss": 1.4572, + "step": 8506 + }, + { + "epoch": 2.582182425254212, + "grad_norm": 0.8481132984161377, + "learning_rate": 5.6985926900880835e-05, + "loss": 1.4722, + "step": 8507 + }, + { + "epoch": 2.582485961450903, + "grad_norm": 0.7238352298736572, + "learning_rate": 5.6980864635010636e-05, + "loss": 1.2255, + "step": 8508 + }, + { + "epoch": 2.5827894976475942, + "grad_norm": 0.82459557056427, + "learning_rate": 5.697580236914043e-05, + "loss": 1.0108, + "step": 8509 + }, + { + "epoch": 2.583093033844286, + "grad_norm": 0.5827746987342834, + "learning_rate": 5.6970740103270226e-05, + "loss": 1.1717, + "step": 8510 + }, + { + "epoch": 2.5833965700409776, + "grad_norm": 0.8375948071479797, + "learning_rate": 5.696567783740002e-05, + "loss": 1.2577, + "step": 8511 + }, + { + "epoch": 2.5837001062376688, + "grad_norm": 0.7858525514602661, + "learning_rate": 5.6960615571529815e-05, + "loss": 1.0145, + "step": 8512 + }, + { + "epoch": 2.5840036424343604, + "grad_norm": 0.8637663722038269, + "learning_rate": 5.695555330565962e-05, + "loss": 0.8378, + "step": 8513 + }, + { + "epoch": 2.5843071786310516, + "grad_norm": 0.8522406816482544, + "learning_rate": 5.695049103978941e-05, + "loss": 1.1828, + "step": 8514 + }, + { + "epoch": 2.5846107148277433, + "grad_norm": 0.7590748071670532, + "learning_rate": 5.6945428773919206e-05, + "loss": 1.5535, + "step": 8515 + }, + { + "epoch": 2.5849142510244345, + "grad_norm": 0.853203296661377, + "learning_rate": 5.6940366508049e-05, + "loss": 1.4439, + "step": 8516 + }, + { + "epoch": 2.585217787221126, + "grad_norm": 0.6160378456115723, + "learning_rate": 5.69353042421788e-05, + "loss": 1.6231, + "step": 8517 + }, + { + "epoch": 2.585521323417818, + "grad_norm": 0.7248274087905884, + "learning_rate": 5.69302419763086e-05, + "loss": 0.9876, + "step": 8518 + }, + { + "epoch": 2.585824859614509, + "grad_norm": 0.7577129006385803, + "learning_rate": 5.692517971043839e-05, + "loss": 1.166, + "step": 8519 + }, + { + "epoch": 2.5861283958112002, + "grad_norm": 0.9573093056678772, + "learning_rate": 5.692011744456819e-05, + "loss": 1.1766, + "step": 8520 + }, + { + "epoch": 2.586431932007892, + "grad_norm": 0.6318233609199524, + "learning_rate": 5.691505517869798e-05, + "loss": 1.7379, + "step": 8521 + }, + { + "epoch": 2.5867354682045836, + "grad_norm": 1.0729773044586182, + "learning_rate": 5.6909992912827784e-05, + "loss": 0.9821, + "step": 8522 + }, + { + "epoch": 2.5870390044012748, + "grad_norm": 1.0490633249282837, + "learning_rate": 5.690493064695758e-05, + "loss": 1.0339, + "step": 8523 + }, + { + "epoch": 2.5873425405979664, + "grad_norm": 0.7213556170463562, + "learning_rate": 5.689986838108737e-05, + "loss": 1.4523, + "step": 8524 + }, + { + "epoch": 2.5876460767946576, + "grad_norm": 0.7995901703834534, + "learning_rate": 5.689480611521717e-05, + "loss": 1.608, + "step": 8525 + }, + { + "epoch": 2.5879496129913493, + "grad_norm": 0.7987606525421143, + "learning_rate": 5.688974384934696e-05, + "loss": 1.0311, + "step": 8526 + }, + { + "epoch": 2.5882531491880405, + "grad_norm": 0.9713065028190613, + "learning_rate": 5.6884681583476764e-05, + "loss": 1.3344, + "step": 8527 + }, + { + "epoch": 2.588556685384732, + "grad_norm": 0.7180582880973816, + "learning_rate": 5.687961931760656e-05, + "loss": 1.5602, + "step": 8528 + }, + { + "epoch": 2.588860221581424, + "grad_norm": 0.8596640229225159, + "learning_rate": 5.6874557051736354e-05, + "loss": 1.5416, + "step": 8529 + }, + { + "epoch": 2.589163757778115, + "grad_norm": 0.8000577092170715, + "learning_rate": 5.686949478586615e-05, + "loss": 1.4195, + "step": 8530 + }, + { + "epoch": 2.5894672939748062, + "grad_norm": 0.5756254196166992, + "learning_rate": 5.686443251999596e-05, + "loss": 1.09, + "step": 8531 + }, + { + "epoch": 2.589770830171498, + "grad_norm": 0.6958990693092346, + "learning_rate": 5.685937025412575e-05, + "loss": 1.7655, + "step": 8532 + }, + { + "epoch": 2.5900743663681896, + "grad_norm": 0.6716173887252808, + "learning_rate": 5.6854307988255553e-05, + "loss": 1.1914, + "step": 8533 + }, + { + "epoch": 2.5903779025648808, + "grad_norm": 0.6973176002502441, + "learning_rate": 5.684924572238535e-05, + "loss": 1.4955, + "step": 8534 + }, + { + "epoch": 2.5906814387615724, + "grad_norm": 0.7571681141853333, + "learning_rate": 5.684418345651514e-05, + "loss": 1.4925, + "step": 8535 + }, + { + "epoch": 2.5909849749582636, + "grad_norm": 0.7972629070281982, + "learning_rate": 5.683912119064494e-05, + "loss": 0.9581, + "step": 8536 + }, + { + "epoch": 2.5912885111549553, + "grad_norm": 0.5946874022483826, + "learning_rate": 5.683405892477473e-05, + "loss": 1.4677, + "step": 8537 + }, + { + "epoch": 2.5915920473516465, + "grad_norm": 0.721113920211792, + "learning_rate": 5.6828996658904534e-05, + "loss": 1.1401, + "step": 8538 + }, + { + "epoch": 2.591895583548338, + "grad_norm": 0.8047667145729065, + "learning_rate": 5.682393439303433e-05, + "loss": 1.4372, + "step": 8539 + }, + { + "epoch": 2.59219911974503, + "grad_norm": 0.7815547585487366, + "learning_rate": 5.6818872127164124e-05, + "loss": 1.5106, + "step": 8540 + }, + { + "epoch": 2.592502655941721, + "grad_norm": 0.6391544342041016, + "learning_rate": 5.681380986129392e-05, + "loss": 1.6492, + "step": 8541 + }, + { + "epoch": 2.5928061921384122, + "grad_norm": 0.8339974284172058, + "learning_rate": 5.680874759542372e-05, + "loss": 1.3008, + "step": 8542 + }, + { + "epoch": 2.593109728335104, + "grad_norm": 0.8036449551582336, + "learning_rate": 5.6803685329553515e-05, + "loss": 1.384, + "step": 8543 + }, + { + "epoch": 2.5934132645317955, + "grad_norm": 0.712510883808136, + "learning_rate": 5.679862306368331e-05, + "loss": 1.2614, + "step": 8544 + }, + { + "epoch": 2.5937168007284868, + "grad_norm": 0.6258426904678345, + "learning_rate": 5.6793560797813105e-05, + "loss": 1.3732, + "step": 8545 + }, + { + "epoch": 2.5940203369251784, + "grad_norm": 0.8587894439697266, + "learning_rate": 5.67884985319429e-05, + "loss": 0.8554, + "step": 8546 + }, + { + "epoch": 2.5943238731218696, + "grad_norm": 0.9387930631637573, + "learning_rate": 5.67834362660727e-05, + "loss": 1.2503, + "step": 8547 + }, + { + "epoch": 2.5946274093185613, + "grad_norm": 0.8170998096466064, + "learning_rate": 5.6778374000202496e-05, + "loss": 1.6741, + "step": 8548 + }, + { + "epoch": 2.5949309455152525, + "grad_norm": 0.7400410771369934, + "learning_rate": 5.677331173433229e-05, + "loss": 1.3155, + "step": 8549 + }, + { + "epoch": 2.595234481711944, + "grad_norm": 0.738135814666748, + "learning_rate": 5.6768249468462085e-05, + "loss": 1.4813, + "step": 8550 + }, + { + "epoch": 2.595538017908636, + "grad_norm": 0.9240551590919495, + "learning_rate": 5.676318720259188e-05, + "loss": 1.6402, + "step": 8551 + }, + { + "epoch": 2.595841554105327, + "grad_norm": 0.7406201958656311, + "learning_rate": 5.675812493672168e-05, + "loss": 1.7953, + "step": 8552 + }, + { + "epoch": 2.5961450903020187, + "grad_norm": 0.7671360969543457, + "learning_rate": 5.6753062670851476e-05, + "loss": 1.1308, + "step": 8553 + }, + { + "epoch": 2.59644862649871, + "grad_norm": 0.6881118416786194, + "learning_rate": 5.674800040498127e-05, + "loss": 1.79, + "step": 8554 + }, + { + "epoch": 2.5967521626954015, + "grad_norm": 0.7133281826972961, + "learning_rate": 5.6742938139111066e-05, + "loss": 1.0629, + "step": 8555 + }, + { + "epoch": 2.5970556988920928, + "grad_norm": 0.7810043096542358, + "learning_rate": 5.673787587324087e-05, + "loss": 1.6822, + "step": 8556 + }, + { + "epoch": 2.5973592350887844, + "grad_norm": 0.8383316397666931, + "learning_rate": 5.673281360737066e-05, + "loss": 1.3929, + "step": 8557 + }, + { + "epoch": 2.597662771285476, + "grad_norm": 0.8424170613288879, + "learning_rate": 5.672775134150046e-05, + "loss": 1.0364, + "step": 8558 + }, + { + "epoch": 2.5979663074821673, + "grad_norm": 0.7133161425590515, + "learning_rate": 5.672268907563025e-05, + "loss": 1.5446, + "step": 8559 + }, + { + "epoch": 2.5982698436788585, + "grad_norm": 0.5788865685462952, + "learning_rate": 5.671762680976005e-05, + "loss": 1.5501, + "step": 8560 + }, + { + "epoch": 2.59857337987555, + "grad_norm": 0.692284882068634, + "learning_rate": 5.671256454388985e-05, + "loss": 1.6923, + "step": 8561 + }, + { + "epoch": 2.598876916072242, + "grad_norm": 0.7362749576568604, + "learning_rate": 5.670750227801964e-05, + "loss": 1.6645, + "step": 8562 + }, + { + "epoch": 2.599180452268933, + "grad_norm": 0.6713787317276001, + "learning_rate": 5.670244001214944e-05, + "loss": 1.4341, + "step": 8563 + }, + { + "epoch": 2.5994839884656247, + "grad_norm": 0.8715011477470398, + "learning_rate": 5.669737774627923e-05, + "loss": 1.0799, + "step": 8564 + }, + { + "epoch": 2.599787524662316, + "grad_norm": 0.6888600587844849, + "learning_rate": 5.669231548040903e-05, + "loss": 1.7989, + "step": 8565 + }, + { + "epoch": 2.6000910608590075, + "grad_norm": 0.6829241514205933, + "learning_rate": 5.668725321453883e-05, + "loss": 1.236, + "step": 8566 + }, + { + "epoch": 2.6003945970556988, + "grad_norm": 0.79676753282547, + "learning_rate": 5.6682190948668624e-05, + "loss": 1.4303, + "step": 8567 + }, + { + "epoch": 2.6006981332523904, + "grad_norm": 0.84662926197052, + "learning_rate": 5.667712868279842e-05, + "loss": 1.3441, + "step": 8568 + }, + { + "epoch": 2.601001669449082, + "grad_norm": 0.8637576103210449, + "learning_rate": 5.6672066416928213e-05, + "loss": 1.2666, + "step": 8569 + }, + { + "epoch": 2.6013052056457733, + "grad_norm": 0.9721615314483643, + "learning_rate": 5.6667004151058015e-05, + "loss": 1.328, + "step": 8570 + }, + { + "epoch": 2.6016087418424645, + "grad_norm": 0.7033756375312805, + "learning_rate": 5.666194188518781e-05, + "loss": 1.3619, + "step": 8571 + }, + { + "epoch": 2.601912278039156, + "grad_norm": 0.8240988850593567, + "learning_rate": 5.6656879619317605e-05, + "loss": 1.3551, + "step": 8572 + }, + { + "epoch": 2.602215814235848, + "grad_norm": 0.6749477386474609, + "learning_rate": 5.66518173534474e-05, + "loss": 1.5243, + "step": 8573 + }, + { + "epoch": 2.602519350432539, + "grad_norm": 0.7842938303947449, + "learning_rate": 5.6646755087577194e-05, + "loss": 1.4415, + "step": 8574 + }, + { + "epoch": 2.6028228866292307, + "grad_norm": 0.7162943482398987, + "learning_rate": 5.6641692821707e-05, + "loss": 1.7193, + "step": 8575 + }, + { + "epoch": 2.603126422825922, + "grad_norm": 0.7127130031585693, + "learning_rate": 5.66366305558368e-05, + "loss": 0.9039, + "step": 8576 + }, + { + "epoch": 2.6034299590226135, + "grad_norm": 0.7178913354873657, + "learning_rate": 5.66315682899666e-05, + "loss": 1.1871, + "step": 8577 + }, + { + "epoch": 2.6037334952193047, + "grad_norm": 0.801655650138855, + "learning_rate": 5.6626506024096394e-05, + "loss": 1.5558, + "step": 8578 + }, + { + "epoch": 2.6040370314159964, + "grad_norm": 0.8066810965538025, + "learning_rate": 5.662144375822619e-05, + "loss": 0.9176, + "step": 8579 + }, + { + "epoch": 2.604340567612688, + "grad_norm": 0.6406304836273193, + "learning_rate": 5.661638149235598e-05, + "loss": 1.3271, + "step": 8580 + }, + { + "epoch": 2.6046441038093793, + "grad_norm": 0.7086127996444702, + "learning_rate": 5.6611319226485785e-05, + "loss": 1.4672, + "step": 8581 + }, + { + "epoch": 2.6049476400060705, + "grad_norm": 0.774001955986023, + "learning_rate": 5.660625696061558e-05, + "loss": 1.058, + "step": 8582 + }, + { + "epoch": 2.605251176202762, + "grad_norm": 0.90226811170578, + "learning_rate": 5.6601194694745374e-05, + "loss": 0.9913, + "step": 8583 + }, + { + "epoch": 2.605554712399454, + "grad_norm": 0.6938377022743225, + "learning_rate": 5.659613242887517e-05, + "loss": 1.1436, + "step": 8584 + }, + { + "epoch": 2.605858248596145, + "grad_norm": 0.776210606098175, + "learning_rate": 5.6591070163004964e-05, + "loss": 1.4291, + "step": 8585 + }, + { + "epoch": 2.6061617847928367, + "grad_norm": 0.7756494283676147, + "learning_rate": 5.6586007897134766e-05, + "loss": 1.3007, + "step": 8586 + }, + { + "epoch": 2.606465320989528, + "grad_norm": 0.8304345011711121, + "learning_rate": 5.658094563126456e-05, + "loss": 1.1779, + "step": 8587 + }, + { + "epoch": 2.6067688571862195, + "grad_norm": 0.9503403306007385, + "learning_rate": 5.6575883365394355e-05, + "loss": 1.2729, + "step": 8588 + }, + { + "epoch": 2.6070723933829107, + "grad_norm": 0.7229785323143005, + "learning_rate": 5.657082109952415e-05, + "loss": 1.4943, + "step": 8589 + }, + { + "epoch": 2.6073759295796024, + "grad_norm": 0.7801215052604675, + "learning_rate": 5.6565758833653945e-05, + "loss": 1.387, + "step": 8590 + }, + { + "epoch": 2.607679465776294, + "grad_norm": 0.7345092296600342, + "learning_rate": 5.6560696567783746e-05, + "loss": 1.395, + "step": 8591 + }, + { + "epoch": 2.6079830019729853, + "grad_norm": 0.8644165992736816, + "learning_rate": 5.655563430191354e-05, + "loss": 1.3065, + "step": 8592 + }, + { + "epoch": 2.6082865381696765, + "grad_norm": 0.7582297921180725, + "learning_rate": 5.6550572036043336e-05, + "loss": 1.4935, + "step": 8593 + }, + { + "epoch": 2.608590074366368, + "grad_norm": 0.9125105142593384, + "learning_rate": 5.654550977017313e-05, + "loss": 1.2867, + "step": 8594 + }, + { + "epoch": 2.60889361056306, + "grad_norm": 0.8298311233520508, + "learning_rate": 5.654044750430293e-05, + "loss": 1.4948, + "step": 8595 + }, + { + "epoch": 2.609197146759751, + "grad_norm": 0.9472196698188782, + "learning_rate": 5.653538523843273e-05, + "loss": 0.7515, + "step": 8596 + }, + { + "epoch": 2.6095006829564427, + "grad_norm": 1.047179102897644, + "learning_rate": 5.653032297256252e-05, + "loss": 0.9547, + "step": 8597 + }, + { + "epoch": 2.609804219153134, + "grad_norm": 0.7703762054443359, + "learning_rate": 5.652526070669232e-05, + "loss": 1.4232, + "step": 8598 + }, + { + "epoch": 2.6101077553498255, + "grad_norm": 0.6805330514907837, + "learning_rate": 5.652019844082211e-05, + "loss": 1.2739, + "step": 8599 + }, + { + "epoch": 2.6104112915465167, + "grad_norm": 0.8809008598327637, + "learning_rate": 5.651513617495191e-05, + "loss": 1.1999, + "step": 8600 + }, + { + "epoch": 2.6107148277432084, + "grad_norm": 0.7000548839569092, + "learning_rate": 5.651007390908171e-05, + "loss": 1.0212, + "step": 8601 + }, + { + "epoch": 2.6110183639399, + "grad_norm": 0.8035356402397156, + "learning_rate": 5.65050116432115e-05, + "loss": 1.5589, + "step": 8602 + }, + { + "epoch": 2.6113219001365913, + "grad_norm": 0.9235441088676453, + "learning_rate": 5.64999493773413e-05, + "loss": 1.2626, + "step": 8603 + }, + { + "epoch": 2.6116254363332825, + "grad_norm": 0.8452057242393494, + "learning_rate": 5.649488711147109e-05, + "loss": 1.2997, + "step": 8604 + }, + { + "epoch": 2.611928972529974, + "grad_norm": 0.8349605202674866, + "learning_rate": 5.6489824845600894e-05, + "loss": 1.4014, + "step": 8605 + }, + { + "epoch": 2.612232508726666, + "grad_norm": 0.586233913898468, + "learning_rate": 5.648476257973069e-05, + "loss": 0.8841, + "step": 8606 + }, + { + "epoch": 2.612536044923357, + "grad_norm": 1.050804853439331, + "learning_rate": 5.6479700313860483e-05, + "loss": 1.2541, + "step": 8607 + }, + { + "epoch": 2.6128395811200487, + "grad_norm": 0.7679193615913391, + "learning_rate": 5.647463804799028e-05, + "loss": 1.1245, + "step": 8608 + }, + { + "epoch": 2.61314311731674, + "grad_norm": 0.9099006056785583, + "learning_rate": 5.646957578212008e-05, + "loss": 1.2773, + "step": 8609 + }, + { + "epoch": 2.6134466535134315, + "grad_norm": 0.7303164601325989, + "learning_rate": 5.6464513516249875e-05, + "loss": 1.4331, + "step": 8610 + }, + { + "epoch": 2.6137501897101227, + "grad_norm": 0.9463585615158081, + "learning_rate": 5.645945125037967e-05, + "loss": 1.6328, + "step": 8611 + }, + { + "epoch": 2.6140537259068144, + "grad_norm": 0.7220118045806885, + "learning_rate": 5.6454388984509464e-05, + "loss": 1.1847, + "step": 8612 + }, + { + "epoch": 2.614357262103506, + "grad_norm": 0.8586870431900024, + "learning_rate": 5.644932671863926e-05, + "loss": 1.1935, + "step": 8613 + }, + { + "epoch": 2.6146607983001973, + "grad_norm": 0.7535932660102844, + "learning_rate": 5.644426445276906e-05, + "loss": 1.4595, + "step": 8614 + }, + { + "epoch": 2.614964334496889, + "grad_norm": 0.7344147562980652, + "learning_rate": 5.6439202186898855e-05, + "loss": 1.5222, + "step": 8615 + }, + { + "epoch": 2.61526787069358, + "grad_norm": 0.8207230567932129, + "learning_rate": 5.643413992102865e-05, + "loss": 1.3544, + "step": 8616 + }, + { + "epoch": 2.615571406890272, + "grad_norm": 1.018458366394043, + "learning_rate": 5.6429077655158445e-05, + "loss": 1.5002, + "step": 8617 + }, + { + "epoch": 2.615874943086963, + "grad_norm": 0.7685471177101135, + "learning_rate": 5.642401538928824e-05, + "loss": 1.6546, + "step": 8618 + }, + { + "epoch": 2.6161784792836547, + "grad_norm": 0.8092283010482788, + "learning_rate": 5.641895312341804e-05, + "loss": 1.3715, + "step": 8619 + }, + { + "epoch": 2.616482015480346, + "grad_norm": 0.9026480913162231, + "learning_rate": 5.641389085754785e-05, + "loss": 1.292, + "step": 8620 + }, + { + "epoch": 2.6167855516770375, + "grad_norm": 0.8444890975952148, + "learning_rate": 5.6408828591677644e-05, + "loss": 1.2972, + "step": 8621 + }, + { + "epoch": 2.6170890878737287, + "grad_norm": 1.0628342628479004, + "learning_rate": 5.640376632580744e-05, + "loss": 0.8799, + "step": 8622 + }, + { + "epoch": 2.6173926240704204, + "grad_norm": 0.9359757304191589, + "learning_rate": 5.6398704059937234e-05, + "loss": 1.0312, + "step": 8623 + }, + { + "epoch": 2.617696160267112, + "grad_norm": 0.6911135315895081, + "learning_rate": 5.639364179406703e-05, + "loss": 0.7175, + "step": 8624 + }, + { + "epoch": 2.6179996964638033, + "grad_norm": 0.7658038139343262, + "learning_rate": 5.638857952819683e-05, + "loss": 1.4084, + "step": 8625 + }, + { + "epoch": 2.618303232660495, + "grad_norm": 0.844971776008606, + "learning_rate": 5.6383517262326625e-05, + "loss": 1.1376, + "step": 8626 + }, + { + "epoch": 2.618606768857186, + "grad_norm": 0.8520523309707642, + "learning_rate": 5.637845499645642e-05, + "loss": 1.5798, + "step": 8627 + }, + { + "epoch": 2.618910305053878, + "grad_norm": 0.7942187786102295, + "learning_rate": 5.6373392730586215e-05, + "loss": 0.9526, + "step": 8628 + }, + { + "epoch": 2.619213841250569, + "grad_norm": 0.7984597086906433, + "learning_rate": 5.636833046471601e-05, + "loss": 1.3979, + "step": 8629 + }, + { + "epoch": 2.6195173774472607, + "grad_norm": 0.9261837005615234, + "learning_rate": 5.636326819884581e-05, + "loss": 1.6082, + "step": 8630 + }, + { + "epoch": 2.6198209136439523, + "grad_norm": 0.77427077293396, + "learning_rate": 5.6358205932975606e-05, + "loss": 1.0873, + "step": 8631 + }, + { + "epoch": 2.6201244498406435, + "grad_norm": 0.7438533306121826, + "learning_rate": 5.63531436671054e-05, + "loss": 1.2523, + "step": 8632 + }, + { + "epoch": 2.6204279860373347, + "grad_norm": 0.7224971055984497, + "learning_rate": 5.6348081401235196e-05, + "loss": 1.3656, + "step": 8633 + }, + { + "epoch": 2.6207315222340264, + "grad_norm": 0.7457458972930908, + "learning_rate": 5.6343019135365e-05, + "loss": 1.5195, + "step": 8634 + }, + { + "epoch": 2.621035058430718, + "grad_norm": 0.6107844114303589, + "learning_rate": 5.633795686949479e-05, + "loss": 1.7412, + "step": 8635 + }, + { + "epoch": 2.6213385946274093, + "grad_norm": 0.6721503734588623, + "learning_rate": 5.633289460362459e-05, + "loss": 1.691, + "step": 8636 + }, + { + "epoch": 2.621642130824101, + "grad_norm": 0.9568755626678467, + "learning_rate": 5.632783233775438e-05, + "loss": 1.0017, + "step": 8637 + }, + { + "epoch": 2.621945667020792, + "grad_norm": 0.8790843486785889, + "learning_rate": 5.6322770071884176e-05, + "loss": 1.4398, + "step": 8638 + }, + { + "epoch": 2.622249203217484, + "grad_norm": 0.8850890398025513, + "learning_rate": 5.631770780601398e-05, + "loss": 1.4364, + "step": 8639 + }, + { + "epoch": 2.622552739414175, + "grad_norm": 0.6632746458053589, + "learning_rate": 5.631264554014377e-05, + "loss": 1.1789, + "step": 8640 + }, + { + "epoch": 2.6228562756108666, + "grad_norm": 0.7869992852210999, + "learning_rate": 5.630758327427357e-05, + "loss": 1.3563, + "step": 8641 + }, + { + "epoch": 2.6231598118075583, + "grad_norm": 0.9065262079238892, + "learning_rate": 5.630252100840336e-05, + "loss": 1.6158, + "step": 8642 + }, + { + "epoch": 2.6234633480042495, + "grad_norm": 0.7699462175369263, + "learning_rate": 5.629745874253316e-05, + "loss": 1.6771, + "step": 8643 + }, + { + "epoch": 2.6237668842009407, + "grad_norm": 0.787337601184845, + "learning_rate": 5.629239647666296e-05, + "loss": 0.8428, + "step": 8644 + }, + { + "epoch": 2.6240704203976324, + "grad_norm": 0.8821600675582886, + "learning_rate": 5.628733421079275e-05, + "loss": 1.2923, + "step": 8645 + }, + { + "epoch": 2.624373956594324, + "grad_norm": 0.9925124049186707, + "learning_rate": 5.628227194492255e-05, + "loss": 1.528, + "step": 8646 + }, + { + "epoch": 2.6246774927910153, + "grad_norm": 0.8500638604164124, + "learning_rate": 5.627720967905234e-05, + "loss": 1.4636, + "step": 8647 + }, + { + "epoch": 2.624981028987707, + "grad_norm": 0.6999285221099854, + "learning_rate": 5.6272147413182145e-05, + "loss": 1.0709, + "step": 8648 + }, + { + "epoch": 2.625284565184398, + "grad_norm": 0.7922846674919128, + "learning_rate": 5.626708514731194e-05, + "loss": 1.2424, + "step": 8649 + }, + { + "epoch": 2.62558810138109, + "grad_norm": 0.7939204573631287, + "learning_rate": 5.6262022881441734e-05, + "loss": 1.6343, + "step": 8650 + }, + { + "epoch": 2.625891637577781, + "grad_norm": 0.9492472410202026, + "learning_rate": 5.625696061557153e-05, + "loss": 1.1092, + "step": 8651 + }, + { + "epoch": 2.6261951737744726, + "grad_norm": 0.7657208442687988, + "learning_rate": 5.6251898349701324e-05, + "loss": 1.2051, + "step": 8652 + }, + { + "epoch": 2.6264987099711643, + "grad_norm": 0.7393199801445007, + "learning_rate": 5.6246836083831125e-05, + "loss": 1.4236, + "step": 8653 + }, + { + "epoch": 2.6268022461678555, + "grad_norm": 0.8838475942611694, + "learning_rate": 5.624177381796092e-05, + "loss": 1.4203, + "step": 8654 + }, + { + "epoch": 2.6271057823645467, + "grad_norm": 0.768323540687561, + "learning_rate": 5.6236711552090715e-05, + "loss": 1.5179, + "step": 8655 + }, + { + "epoch": 2.6274093185612384, + "grad_norm": 0.8612476587295532, + "learning_rate": 5.623164928622051e-05, + "loss": 0.9426, + "step": 8656 + }, + { + "epoch": 2.62771285475793, + "grad_norm": 0.6874903440475464, + "learning_rate": 5.6226587020350304e-05, + "loss": 1.5948, + "step": 8657 + }, + { + "epoch": 2.6280163909546213, + "grad_norm": 1.2635430097579956, + "learning_rate": 5.6221524754480106e-05, + "loss": 1.0308, + "step": 8658 + }, + { + "epoch": 2.628319927151313, + "grad_norm": 0.7369987368583679, + "learning_rate": 5.62164624886099e-05, + "loss": 1.3982, + "step": 8659 + }, + { + "epoch": 2.628623463348004, + "grad_norm": 0.7699949741363525, + "learning_rate": 5.6211400222739696e-05, + "loss": 1.4156, + "step": 8660 + }, + { + "epoch": 2.6289269995446958, + "grad_norm": 0.6728193759918213, + "learning_rate": 5.620633795686949e-05, + "loss": 1.473, + "step": 8661 + }, + { + "epoch": 2.629230535741387, + "grad_norm": 0.669257402420044, + "learning_rate": 5.620127569099929e-05, + "loss": 0.4621, + "step": 8662 + }, + { + "epoch": 2.6295340719380786, + "grad_norm": 0.7065978050231934, + "learning_rate": 5.619621342512909e-05, + "loss": 1.4676, + "step": 8663 + }, + { + "epoch": 2.6298376081347703, + "grad_norm": 0.7422645688056946, + "learning_rate": 5.6191151159258895e-05, + "loss": 1.3818, + "step": 8664 + }, + { + "epoch": 2.6301411443314615, + "grad_norm": 0.8151669502258301, + "learning_rate": 5.618608889338869e-05, + "loss": 1.5563, + "step": 8665 + }, + { + "epoch": 2.6304446805281527, + "grad_norm": 0.6048735976219177, + "learning_rate": 5.6181026627518485e-05, + "loss": 1.3932, + "step": 8666 + }, + { + "epoch": 2.6307482167248444, + "grad_norm": 0.8093709349632263, + "learning_rate": 5.617596436164828e-05, + "loss": 1.4714, + "step": 8667 + }, + { + "epoch": 2.631051752921536, + "grad_norm": 0.904063880443573, + "learning_rate": 5.6170902095778074e-05, + "loss": 1.1276, + "step": 8668 + }, + { + "epoch": 2.6313552891182272, + "grad_norm": 0.9942129850387573, + "learning_rate": 5.6165839829907876e-05, + "loss": 1.3145, + "step": 8669 + }, + { + "epoch": 2.631658825314919, + "grad_norm": 0.9170172214508057, + "learning_rate": 5.616077756403767e-05, + "loss": 1.361, + "step": 8670 + }, + { + "epoch": 2.63196236151161, + "grad_norm": 0.968109667301178, + "learning_rate": 5.6155715298167465e-05, + "loss": 0.9555, + "step": 8671 + }, + { + "epoch": 2.6322658977083018, + "grad_norm": 0.9411510229110718, + "learning_rate": 5.615065303229726e-05, + "loss": 1.3459, + "step": 8672 + }, + { + "epoch": 2.632569433904993, + "grad_norm": 0.6706832647323608, + "learning_rate": 5.614559076642706e-05, + "loss": 1.3142, + "step": 8673 + }, + { + "epoch": 2.6328729701016846, + "grad_norm": 0.9438613057136536, + "learning_rate": 5.614052850055686e-05, + "loss": 1.4197, + "step": 8674 + }, + { + "epoch": 2.6331765062983763, + "grad_norm": 0.8801570534706116, + "learning_rate": 5.613546623468665e-05, + "loss": 1.3537, + "step": 8675 + }, + { + "epoch": 2.6334800424950675, + "grad_norm": 0.7939736247062683, + "learning_rate": 5.6130403968816446e-05, + "loss": 1.3053, + "step": 8676 + }, + { + "epoch": 2.6337835786917587, + "grad_norm": 0.7711239457130432, + "learning_rate": 5.612534170294624e-05, + "loss": 1.0631, + "step": 8677 + }, + { + "epoch": 2.6340871148884504, + "grad_norm": 0.8416082262992859, + "learning_rate": 5.612027943707604e-05, + "loss": 1.3773, + "step": 8678 + }, + { + "epoch": 2.634390651085142, + "grad_norm": 0.7541301846504211, + "learning_rate": 5.611521717120584e-05, + "loss": 1.4581, + "step": 8679 + }, + { + "epoch": 2.6346941872818332, + "grad_norm": 0.8372868299484253, + "learning_rate": 5.611015490533563e-05, + "loss": 1.2465, + "step": 8680 + }, + { + "epoch": 2.634997723478525, + "grad_norm": 0.613940954208374, + "learning_rate": 5.610509263946543e-05, + "loss": 0.8319, + "step": 8681 + }, + { + "epoch": 2.635301259675216, + "grad_norm": 0.6832600831985474, + "learning_rate": 5.610003037359522e-05, + "loss": 1.6382, + "step": 8682 + }, + { + "epoch": 2.6356047958719078, + "grad_norm": 0.8606064915657043, + "learning_rate": 5.609496810772502e-05, + "loss": 1.2397, + "step": 8683 + }, + { + "epoch": 2.635908332068599, + "grad_norm": 0.7566887736320496, + "learning_rate": 5.608990584185482e-05, + "loss": 1.3182, + "step": 8684 + }, + { + "epoch": 2.6362118682652906, + "grad_norm": 0.9536645412445068, + "learning_rate": 5.608484357598461e-05, + "loss": 1.334, + "step": 8685 + }, + { + "epoch": 2.6365154044619823, + "grad_norm": 0.7034269571304321, + "learning_rate": 5.607978131011441e-05, + "loss": 1.7559, + "step": 8686 + }, + { + "epoch": 2.6368189406586735, + "grad_norm": 0.8286762833595276, + "learning_rate": 5.607471904424421e-05, + "loss": 1.2866, + "step": 8687 + }, + { + "epoch": 2.637122476855365, + "grad_norm": 0.6119000315666199, + "learning_rate": 5.6069656778374004e-05, + "loss": 1.0934, + "step": 8688 + }, + { + "epoch": 2.6374260130520564, + "grad_norm": 0.8798956274986267, + "learning_rate": 5.60645945125038e-05, + "loss": 1.4096, + "step": 8689 + }, + { + "epoch": 2.637729549248748, + "grad_norm": 1.1200183629989624, + "learning_rate": 5.6059532246633594e-05, + "loss": 1.1836, + "step": 8690 + }, + { + "epoch": 2.6380330854454392, + "grad_norm": 0.6121519207954407, + "learning_rate": 5.605446998076339e-05, + "loss": 1.5971, + "step": 8691 + }, + { + "epoch": 2.638336621642131, + "grad_norm": 0.7266345620155334, + "learning_rate": 5.604940771489319e-05, + "loss": 1.5815, + "step": 8692 + }, + { + "epoch": 2.6386401578388226, + "grad_norm": 0.8083154559135437, + "learning_rate": 5.6044345449022985e-05, + "loss": 1.3426, + "step": 8693 + }, + { + "epoch": 2.6389436940355138, + "grad_norm": 0.8204373717308044, + "learning_rate": 5.603928318315278e-05, + "loss": 1.4459, + "step": 8694 + }, + { + "epoch": 2.639247230232205, + "grad_norm": 0.839185357093811, + "learning_rate": 5.6034220917282574e-05, + "loss": 1.3984, + "step": 8695 + }, + { + "epoch": 2.6395507664288966, + "grad_norm": 0.7797434329986572, + "learning_rate": 5.602915865141237e-05, + "loss": 1.3934, + "step": 8696 + }, + { + "epoch": 2.6398543026255883, + "grad_norm": 0.7706314921379089, + "learning_rate": 5.602409638554217e-05, + "loss": 1.5455, + "step": 8697 + }, + { + "epoch": 2.6401578388222795, + "grad_norm": 0.7319291234016418, + "learning_rate": 5.6019034119671966e-05, + "loss": 0.9225, + "step": 8698 + }, + { + "epoch": 2.640461375018971, + "grad_norm": 0.7184723019599915, + "learning_rate": 5.601397185380176e-05, + "loss": 1.6774, + "step": 8699 + }, + { + "epoch": 2.6407649112156624, + "grad_norm": 0.5552166700363159, + "learning_rate": 5.6008909587931555e-05, + "loss": 0.7107, + "step": 8700 + }, + { + "epoch": 2.641068447412354, + "grad_norm": 0.88196861743927, + "learning_rate": 5.600384732206136e-05, + "loss": 1.4091, + "step": 8701 + }, + { + "epoch": 2.6413719836090452, + "grad_norm": 0.9761573076248169, + "learning_rate": 5.599878505619115e-05, + "loss": 0.9132, + "step": 8702 + }, + { + "epoch": 2.641675519805737, + "grad_norm": 0.7083683609962463, + "learning_rate": 5.5993722790320946e-05, + "loss": 1.1454, + "step": 8703 + }, + { + "epoch": 2.6419790560024285, + "grad_norm": 0.7368018627166748, + "learning_rate": 5.598866052445074e-05, + "loss": 1.4792, + "step": 8704 + }, + { + "epoch": 2.6422825921991198, + "grad_norm": 0.7450541257858276, + "learning_rate": 5.5983598258580536e-05, + "loss": 1.3225, + "step": 8705 + }, + { + "epoch": 2.642586128395811, + "grad_norm": 0.7936186194419861, + "learning_rate": 5.597853599271034e-05, + "loss": 1.4453, + "step": 8706 + }, + { + "epoch": 2.6428896645925026, + "grad_norm": 0.7664183378219604, + "learning_rate": 5.597347372684013e-05, + "loss": 1.3638, + "step": 8707 + }, + { + "epoch": 2.6431932007891943, + "grad_norm": 0.8624455332756042, + "learning_rate": 5.596841146096994e-05, + "loss": 1.0626, + "step": 8708 + }, + { + "epoch": 2.6434967369858855, + "grad_norm": 0.7933397889137268, + "learning_rate": 5.5963349195099735e-05, + "loss": 1.3289, + "step": 8709 + }, + { + "epoch": 2.643800273182577, + "grad_norm": 0.8627804517745972, + "learning_rate": 5.595828692922953e-05, + "loss": 1.4403, + "step": 8710 + }, + { + "epoch": 2.6441038093792684, + "grad_norm": 0.9213400483131409, + "learning_rate": 5.5953224663359325e-05, + "loss": 1.1033, + "step": 8711 + }, + { + "epoch": 2.64440734557596, + "grad_norm": 0.8947356939315796, + "learning_rate": 5.5948162397489127e-05, + "loss": 1.4111, + "step": 8712 + }, + { + "epoch": 2.6447108817726512, + "grad_norm": 0.8398584127426147, + "learning_rate": 5.594310013161892e-05, + "loss": 1.4132, + "step": 8713 + }, + { + "epoch": 2.645014417969343, + "grad_norm": 0.6768031120300293, + "learning_rate": 5.5938037865748716e-05, + "loss": 1.4414, + "step": 8714 + }, + { + "epoch": 2.6453179541660345, + "grad_norm": 0.6541370749473572, + "learning_rate": 5.593297559987851e-05, + "loss": 1.0148, + "step": 8715 + }, + { + "epoch": 2.6456214903627258, + "grad_norm": 0.7286548614501953, + "learning_rate": 5.5927913334008306e-05, + "loss": 1.0437, + "step": 8716 + }, + { + "epoch": 2.645925026559417, + "grad_norm": 0.8829400539398193, + "learning_rate": 5.592285106813811e-05, + "loss": 1.2994, + "step": 8717 + }, + { + "epoch": 2.6462285627561086, + "grad_norm": 0.7351943850517273, + "learning_rate": 5.59177888022679e-05, + "loss": 1.3467, + "step": 8718 + }, + { + "epoch": 2.6465320989528003, + "grad_norm": 0.6213396787643433, + "learning_rate": 5.59127265363977e-05, + "loss": 1.6852, + "step": 8719 + }, + { + "epoch": 2.6468356351494915, + "grad_norm": 0.8491136431694031, + "learning_rate": 5.590766427052749e-05, + "loss": 1.1302, + "step": 8720 + }, + { + "epoch": 2.647139171346183, + "grad_norm": 0.6281788349151611, + "learning_rate": 5.5902602004657287e-05, + "loss": 1.8956, + "step": 8721 + }, + { + "epoch": 2.6474427075428744, + "grad_norm": 0.839736819267273, + "learning_rate": 5.589753973878709e-05, + "loss": 1.224, + "step": 8722 + }, + { + "epoch": 2.647746243739566, + "grad_norm": 0.6985284090042114, + "learning_rate": 5.589247747291688e-05, + "loss": 1.585, + "step": 8723 + }, + { + "epoch": 2.6480497799362572, + "grad_norm": 0.7261654138565063, + "learning_rate": 5.588741520704668e-05, + "loss": 1.3567, + "step": 8724 + }, + { + "epoch": 2.648353316132949, + "grad_norm": 0.8689409494400024, + "learning_rate": 5.588235294117647e-05, + "loss": 1.4018, + "step": 8725 + }, + { + "epoch": 2.6486568523296405, + "grad_norm": 0.9367418885231018, + "learning_rate": 5.5877290675306274e-05, + "loss": 1.362, + "step": 8726 + }, + { + "epoch": 2.6489603885263318, + "grad_norm": 0.8631361126899719, + "learning_rate": 5.587222840943607e-05, + "loss": 1.355, + "step": 8727 + }, + { + "epoch": 2.649263924723023, + "grad_norm": 0.8527751564979553, + "learning_rate": 5.5867166143565864e-05, + "loss": 1.1807, + "step": 8728 + }, + { + "epoch": 2.6495674609197146, + "grad_norm": 0.7197252511978149, + "learning_rate": 5.586210387769566e-05, + "loss": 1.5399, + "step": 8729 + }, + { + "epoch": 2.6498709971164063, + "grad_norm": 0.7717894315719604, + "learning_rate": 5.585704161182545e-05, + "loss": 0.8643, + "step": 8730 + }, + { + "epoch": 2.6501745333130975, + "grad_norm": 0.5923122763633728, + "learning_rate": 5.5851979345955255e-05, + "loss": 0.9277, + "step": 8731 + }, + { + "epoch": 2.650478069509789, + "grad_norm": 0.8007644414901733, + "learning_rate": 5.584691708008505e-05, + "loss": 1.2303, + "step": 8732 + }, + { + "epoch": 2.6507816057064804, + "grad_norm": 0.9415502548217773, + "learning_rate": 5.5841854814214844e-05, + "loss": 1.2438, + "step": 8733 + }, + { + "epoch": 2.651085141903172, + "grad_norm": 0.7225777506828308, + "learning_rate": 5.583679254834464e-05, + "loss": 1.5995, + "step": 8734 + }, + { + "epoch": 2.6513886780998632, + "grad_norm": 0.8504562377929688, + "learning_rate": 5.5831730282474434e-05, + "loss": 1.46, + "step": 8735 + }, + { + "epoch": 2.651692214296555, + "grad_norm": 0.8856134414672852, + "learning_rate": 5.5826668016604236e-05, + "loss": 1.2693, + "step": 8736 + }, + { + "epoch": 2.6519957504932465, + "grad_norm": 0.5905324220657349, + "learning_rate": 5.582160575073403e-05, + "loss": 1.2373, + "step": 8737 + }, + { + "epoch": 2.6522992866899378, + "grad_norm": 0.6726984977722168, + "learning_rate": 5.5816543484863825e-05, + "loss": 1.5899, + "step": 8738 + }, + { + "epoch": 2.652602822886629, + "grad_norm": 0.7494386434555054, + "learning_rate": 5.581148121899362e-05, + "loss": 1.3021, + "step": 8739 + }, + { + "epoch": 2.6529063590833206, + "grad_norm": 0.7558499574661255, + "learning_rate": 5.580641895312342e-05, + "loss": 1.1097, + "step": 8740 + }, + { + "epoch": 2.6532098952800123, + "grad_norm": 0.8065043687820435, + "learning_rate": 5.5801356687253216e-05, + "loss": 1.385, + "step": 8741 + }, + { + "epoch": 2.6535134314767035, + "grad_norm": 0.9156901836395264, + "learning_rate": 5.579629442138301e-05, + "loss": 1.2601, + "step": 8742 + }, + { + "epoch": 2.653816967673395, + "grad_norm": 0.8504417538642883, + "learning_rate": 5.5791232155512806e-05, + "loss": 1.107, + "step": 8743 + }, + { + "epoch": 2.6541205038700864, + "grad_norm": 0.7907352447509766, + "learning_rate": 5.57861698896426e-05, + "loss": 1.5306, + "step": 8744 + }, + { + "epoch": 2.654424040066778, + "grad_norm": 0.7265666127204895, + "learning_rate": 5.57811076237724e-05, + "loss": 1.4732, + "step": 8745 + }, + { + "epoch": 2.6547275762634692, + "grad_norm": 0.7959305644035339, + "learning_rate": 5.57760453579022e-05, + "loss": 1.1562, + "step": 8746 + }, + { + "epoch": 2.655031112460161, + "grad_norm": 0.8458207249641418, + "learning_rate": 5.577098309203199e-05, + "loss": 0.9998, + "step": 8747 + }, + { + "epoch": 2.6553346486568525, + "grad_norm": 0.8517472743988037, + "learning_rate": 5.576592082616179e-05, + "loss": 1.342, + "step": 8748 + }, + { + "epoch": 2.6556381848535437, + "grad_norm": 0.8582330942153931, + "learning_rate": 5.576085856029158e-05, + "loss": 1.805, + "step": 8749 + }, + { + "epoch": 2.6559417210502354, + "grad_norm": 0.8525375723838806, + "learning_rate": 5.575579629442138e-05, + "loss": 1.4115, + "step": 8750 + }, + { + "epoch": 2.6562452572469266, + "grad_norm": 0.8539319038391113, + "learning_rate": 5.575073402855118e-05, + "loss": 1.4829, + "step": 8751 + }, + { + "epoch": 2.6565487934436183, + "grad_norm": 0.9488011598587036, + "learning_rate": 5.574567176268097e-05, + "loss": 1.0497, + "step": 8752 + }, + { + "epoch": 2.6568523296403095, + "grad_norm": 0.8719534873962402, + "learning_rate": 5.574060949681078e-05, + "loss": 1.3976, + "step": 8753 + }, + { + "epoch": 2.657155865837001, + "grad_norm": 0.8704327344894409, + "learning_rate": 5.5735547230940576e-05, + "loss": 1.4135, + "step": 8754 + }, + { + "epoch": 2.6574594020336924, + "grad_norm": 0.6540507078170776, + "learning_rate": 5.573048496507037e-05, + "loss": 1.4733, + "step": 8755 + }, + { + "epoch": 2.657762938230384, + "grad_norm": 1.0412238836288452, + "learning_rate": 5.572542269920017e-05, + "loss": 1.2556, + "step": 8756 + }, + { + "epoch": 2.658066474427075, + "grad_norm": 0.6366206407546997, + "learning_rate": 5.572036043332997e-05, + "loss": 0.992, + "step": 8757 + }, + { + "epoch": 2.658370010623767, + "grad_norm": 0.7455922961235046, + "learning_rate": 5.571529816745976e-05, + "loss": 1.4857, + "step": 8758 + }, + { + "epoch": 2.6586735468204585, + "grad_norm": 0.7818800806999207, + "learning_rate": 5.5710235901589556e-05, + "loss": 1.3079, + "step": 8759 + }, + { + "epoch": 2.6589770830171497, + "grad_norm": 0.6763079762458801, + "learning_rate": 5.570517363571935e-05, + "loss": 1.4807, + "step": 8760 + }, + { + "epoch": 2.6592806192138414, + "grad_norm": 0.9120274186134338, + "learning_rate": 5.570011136984915e-05, + "loss": 1.3967, + "step": 8761 + }, + { + "epoch": 2.6595841554105326, + "grad_norm": 0.8004521727561951, + "learning_rate": 5.569504910397895e-05, + "loss": 1.2554, + "step": 8762 + }, + { + "epoch": 2.6598876916072243, + "grad_norm": 0.7720287442207336, + "learning_rate": 5.568998683810874e-05, + "loss": 1.4389, + "step": 8763 + }, + { + "epoch": 2.6601912278039155, + "grad_norm": 1.0542229413986206, + "learning_rate": 5.568492457223854e-05, + "loss": 1.1365, + "step": 8764 + }, + { + "epoch": 2.660494764000607, + "grad_norm": 0.6890188455581665, + "learning_rate": 5.567986230636834e-05, + "loss": 1.485, + "step": 8765 + }, + { + "epoch": 2.660798300197299, + "grad_norm": 0.7676848769187927, + "learning_rate": 5.5674800040498134e-05, + "loss": 1.3161, + "step": 8766 + }, + { + "epoch": 2.66110183639399, + "grad_norm": 0.8541175127029419, + "learning_rate": 5.566973777462793e-05, + "loss": 1.1301, + "step": 8767 + }, + { + "epoch": 2.661405372590681, + "grad_norm": 1.1530959606170654, + "learning_rate": 5.566467550875772e-05, + "loss": 1.4562, + "step": 8768 + }, + { + "epoch": 2.661708908787373, + "grad_norm": 0.7591552734375, + "learning_rate": 5.565961324288752e-05, + "loss": 1.1896, + "step": 8769 + }, + { + "epoch": 2.6620124449840645, + "grad_norm": 1.0255539417266846, + "learning_rate": 5.565455097701732e-05, + "loss": 1.2768, + "step": 8770 + }, + { + "epoch": 2.6623159811807557, + "grad_norm": 0.7130913734436035, + "learning_rate": 5.5649488711147114e-05, + "loss": 1.2036, + "step": 8771 + }, + { + "epoch": 2.6626195173774474, + "grad_norm": 0.8823549747467041, + "learning_rate": 5.564442644527691e-05, + "loss": 1.3412, + "step": 8772 + }, + { + "epoch": 2.6629230535741386, + "grad_norm": 0.7604090571403503, + "learning_rate": 5.5639364179406704e-05, + "loss": 0.9941, + "step": 8773 + }, + { + "epoch": 2.6632265897708303, + "grad_norm": 0.8101242184638977, + "learning_rate": 5.56343019135365e-05, + "loss": 1.5021, + "step": 8774 + }, + { + "epoch": 2.6635301259675215, + "grad_norm": 0.7834902405738831, + "learning_rate": 5.56292396476663e-05, + "loss": 1.403, + "step": 8775 + }, + { + "epoch": 2.663833662164213, + "grad_norm": 0.5914158225059509, + "learning_rate": 5.5624177381796095e-05, + "loss": 1.3118, + "step": 8776 + }, + { + "epoch": 2.664137198360905, + "grad_norm": 0.7818302512168884, + "learning_rate": 5.561911511592589e-05, + "loss": 1.5593, + "step": 8777 + }, + { + "epoch": 2.664440734557596, + "grad_norm": 0.8937379717826843, + "learning_rate": 5.5614052850055685e-05, + "loss": 1.1552, + "step": 8778 + }, + { + "epoch": 2.664744270754287, + "grad_norm": 0.916081964969635, + "learning_rate": 5.5608990584185486e-05, + "loss": 1.4133, + "step": 8779 + }, + { + "epoch": 2.665047806950979, + "grad_norm": 0.9175965785980225, + "learning_rate": 5.560392831831528e-05, + "loss": 1.5523, + "step": 8780 + }, + { + "epoch": 2.6653513431476705, + "grad_norm": 0.7606849074363708, + "learning_rate": 5.5598866052445076e-05, + "loss": 1.5572, + "step": 8781 + }, + { + "epoch": 2.6656548793443617, + "grad_norm": 0.8044320344924927, + "learning_rate": 5.559380378657487e-05, + "loss": 1.2658, + "step": 8782 + }, + { + "epoch": 2.6659584155410534, + "grad_norm": 0.5691137909889221, + "learning_rate": 5.5588741520704665e-05, + "loss": 1.0176, + "step": 8783 + }, + { + "epoch": 2.6662619517377446, + "grad_norm": 0.5771634578704834, + "learning_rate": 5.558367925483447e-05, + "loss": 1.2895, + "step": 8784 + }, + { + "epoch": 2.6665654879344363, + "grad_norm": 0.7913541197776794, + "learning_rate": 5.557861698896426e-05, + "loss": 1.1218, + "step": 8785 + }, + { + "epoch": 2.6668690241311275, + "grad_norm": 0.8380897045135498, + "learning_rate": 5.5573554723094057e-05, + "loss": 1.478, + "step": 8786 + }, + { + "epoch": 2.667172560327819, + "grad_norm": 0.8673321008682251, + "learning_rate": 5.556849245722385e-05, + "loss": 1.3437, + "step": 8787 + }, + { + "epoch": 2.667476096524511, + "grad_norm": 0.9138043522834778, + "learning_rate": 5.5563430191353646e-05, + "loss": 1.1263, + "step": 8788 + }, + { + "epoch": 2.667779632721202, + "grad_norm": 0.8235269784927368, + "learning_rate": 5.555836792548345e-05, + "loss": 1.4769, + "step": 8789 + }, + { + "epoch": 2.668083168917893, + "grad_norm": 1.1817772388458252, + "learning_rate": 5.555330565961324e-05, + "loss": 1.4067, + "step": 8790 + }, + { + "epoch": 2.668386705114585, + "grad_norm": 0.778167188167572, + "learning_rate": 5.554824339374304e-05, + "loss": 1.3586, + "step": 8791 + }, + { + "epoch": 2.6686902413112765, + "grad_norm": 0.8383827209472656, + "learning_rate": 5.554318112787283e-05, + "loss": 1.1851, + "step": 8792 + }, + { + "epoch": 2.6689937775079677, + "grad_norm": 0.6682489514350891, + "learning_rate": 5.5538118862002634e-05, + "loss": 1.0986, + "step": 8793 + }, + { + "epoch": 2.6692973137046594, + "grad_norm": 0.6931873559951782, + "learning_rate": 5.553305659613243e-05, + "loss": 1.3819, + "step": 8794 + }, + { + "epoch": 2.6696008499013506, + "grad_norm": 0.7875906825065613, + "learning_rate": 5.552799433026222e-05, + "loss": 1.4054, + "step": 8795 + }, + { + "epoch": 2.6699043860980423, + "grad_norm": 0.7748824954032898, + "learning_rate": 5.552293206439202e-05, + "loss": 1.0686, + "step": 8796 + }, + { + "epoch": 2.6702079222947335, + "grad_norm": 0.7019548416137695, + "learning_rate": 5.5517869798521826e-05, + "loss": 1.7516, + "step": 8797 + }, + { + "epoch": 2.670511458491425, + "grad_norm": 0.7278378009796143, + "learning_rate": 5.551280753265162e-05, + "loss": 1.3714, + "step": 8798 + }, + { + "epoch": 2.670814994688117, + "grad_norm": 0.8102428913116455, + "learning_rate": 5.5507745266781416e-05, + "loss": 1.416, + "step": 8799 + }, + { + "epoch": 2.671118530884808, + "grad_norm": 1.1308832168579102, + "learning_rate": 5.550268300091122e-05, + "loss": 1.0275, + "step": 8800 + }, + { + "epoch": 2.671422067081499, + "grad_norm": 0.6818346977233887, + "learning_rate": 5.549762073504101e-05, + "loss": 0.8489, + "step": 8801 + }, + { + "epoch": 2.671725603278191, + "grad_norm": 0.7548572421073914, + "learning_rate": 5.549255846917081e-05, + "loss": 1.2063, + "step": 8802 + }, + { + "epoch": 2.6720291394748825, + "grad_norm": 0.999927818775177, + "learning_rate": 5.54874962033006e-05, + "loss": 1.4178, + "step": 8803 + }, + { + "epoch": 2.6723326756715737, + "grad_norm": 0.7852398157119751, + "learning_rate": 5.5482433937430404e-05, + "loss": 0.9922, + "step": 8804 + }, + { + "epoch": 2.6726362118682654, + "grad_norm": 0.7230637073516846, + "learning_rate": 5.54773716715602e-05, + "loss": 1.1082, + "step": 8805 + }, + { + "epoch": 2.6729397480649566, + "grad_norm": 1.0759989023208618, + "learning_rate": 5.547230940568999e-05, + "loss": 0.7466, + "step": 8806 + }, + { + "epoch": 2.6732432842616483, + "grad_norm": 0.613384485244751, + "learning_rate": 5.546724713981979e-05, + "loss": 1.1378, + "step": 8807 + }, + { + "epoch": 2.6735468204583395, + "grad_norm": 0.8507648706436157, + "learning_rate": 5.546218487394958e-05, + "loss": 0.9715, + "step": 8808 + }, + { + "epoch": 2.673850356655031, + "grad_norm": 0.7430301308631897, + "learning_rate": 5.5457122608079384e-05, + "loss": 1.4762, + "step": 8809 + }, + { + "epoch": 2.674153892851723, + "grad_norm": 0.7480401992797852, + "learning_rate": 5.545206034220918e-05, + "loss": 1.1704, + "step": 8810 + }, + { + "epoch": 2.674457429048414, + "grad_norm": 0.719728410243988, + "learning_rate": 5.5446998076338974e-05, + "loss": 1.191, + "step": 8811 + }, + { + "epoch": 2.674760965245105, + "grad_norm": 0.8107708692550659, + "learning_rate": 5.544193581046877e-05, + "loss": 0.9739, + "step": 8812 + }, + { + "epoch": 2.675064501441797, + "grad_norm": 0.787104070186615, + "learning_rate": 5.5436873544598563e-05, + "loss": 1.4328, + "step": 8813 + }, + { + "epoch": 2.6753680376384885, + "grad_norm": 1.0159579515457153, + "learning_rate": 5.5431811278728365e-05, + "loss": 1.0301, + "step": 8814 + }, + { + "epoch": 2.6756715738351797, + "grad_norm": 0.8945944905281067, + "learning_rate": 5.542674901285816e-05, + "loss": 1.4273, + "step": 8815 + }, + { + "epoch": 2.6759751100318714, + "grad_norm": 0.6963416934013367, + "learning_rate": 5.5421686746987955e-05, + "loss": 1.0696, + "step": 8816 + }, + { + "epoch": 2.6762786462285626, + "grad_norm": 0.9370332956314087, + "learning_rate": 5.541662448111775e-05, + "loss": 1.5392, + "step": 8817 + }, + { + "epoch": 2.6765821824252543, + "grad_norm": 0.947747528553009, + "learning_rate": 5.541156221524755e-05, + "loss": 1.4117, + "step": 8818 + }, + { + "epoch": 2.6768857186219455, + "grad_norm": 0.8421319127082825, + "learning_rate": 5.5406499949377346e-05, + "loss": 1.2237, + "step": 8819 + }, + { + "epoch": 2.677189254818637, + "grad_norm": 0.790768563747406, + "learning_rate": 5.540143768350714e-05, + "loss": 1.5518, + "step": 8820 + }, + { + "epoch": 2.6774927910153288, + "grad_norm": 0.8691953420639038, + "learning_rate": 5.5396375417636935e-05, + "loss": 1.6724, + "step": 8821 + }, + { + "epoch": 2.67779632721202, + "grad_norm": 0.7883888483047485, + "learning_rate": 5.539131315176673e-05, + "loss": 1.5772, + "step": 8822 + }, + { + "epoch": 2.6780998634087116, + "grad_norm": 0.7148094177246094, + "learning_rate": 5.538625088589653e-05, + "loss": 1.4388, + "step": 8823 + }, + { + "epoch": 2.678403399605403, + "grad_norm": 0.8268208503723145, + "learning_rate": 5.5381188620026327e-05, + "loss": 1.1574, + "step": 8824 + }, + { + "epoch": 2.6787069358020945, + "grad_norm": 0.8798223733901978, + "learning_rate": 5.537612635415612e-05, + "loss": 1.1085, + "step": 8825 + }, + { + "epoch": 2.6790104719987857, + "grad_norm": 0.7200629711151123, + "learning_rate": 5.5371064088285916e-05, + "loss": 1.28, + "step": 8826 + }, + { + "epoch": 2.6793140081954774, + "grad_norm": 1.019481897354126, + "learning_rate": 5.536600182241571e-05, + "loss": 0.87, + "step": 8827 + }, + { + "epoch": 2.679617544392169, + "grad_norm": 0.751953125, + "learning_rate": 5.536093955654551e-05, + "loss": 1.5108, + "step": 8828 + }, + { + "epoch": 2.6799210805888602, + "grad_norm": 0.7722558379173279, + "learning_rate": 5.535587729067531e-05, + "loss": 1.2364, + "step": 8829 + }, + { + "epoch": 2.6802246167855515, + "grad_norm": 0.7905706167221069, + "learning_rate": 5.53508150248051e-05, + "loss": 1.4848, + "step": 8830 + }, + { + "epoch": 2.680528152982243, + "grad_norm": 0.7805942296981812, + "learning_rate": 5.53457527589349e-05, + "loss": 1.4817, + "step": 8831 + }, + { + "epoch": 2.6808316891789348, + "grad_norm": 0.6384637951850891, + "learning_rate": 5.53406904930647e-05, + "loss": 1.4018, + "step": 8832 + }, + { + "epoch": 2.681135225375626, + "grad_norm": 0.7865663766860962, + "learning_rate": 5.533562822719449e-05, + "loss": 1.0755, + "step": 8833 + }, + { + "epoch": 2.6814387615723176, + "grad_norm": 0.7390419244766235, + "learning_rate": 5.533056596132429e-05, + "loss": 1.563, + "step": 8834 + }, + { + "epoch": 2.681742297769009, + "grad_norm": 0.8227308392524719, + "learning_rate": 5.532550369545408e-05, + "loss": 1.2486, + "step": 8835 + }, + { + "epoch": 2.6820458339657005, + "grad_norm": 0.6374137997627258, + "learning_rate": 5.532044142958388e-05, + "loss": 0.954, + "step": 8836 + }, + { + "epoch": 2.6823493701623917, + "grad_norm": 0.873408317565918, + "learning_rate": 5.531537916371368e-05, + "loss": 1.2549, + "step": 8837 + }, + { + "epoch": 2.6826529063590834, + "grad_norm": 0.7719585299491882, + "learning_rate": 5.5310316897843474e-05, + "loss": 0.8281, + "step": 8838 + }, + { + "epoch": 2.682956442555775, + "grad_norm": 0.823834240436554, + "learning_rate": 5.530525463197327e-05, + "loss": 1.3755, + "step": 8839 + }, + { + "epoch": 2.6832599787524662, + "grad_norm": 0.8945797085762024, + "learning_rate": 5.5300192366103064e-05, + "loss": 1.4325, + "step": 8840 + }, + { + "epoch": 2.6835635149491575, + "grad_norm": 0.7850164771080017, + "learning_rate": 5.529513010023286e-05, + "loss": 1.2876, + "step": 8841 + }, + { + "epoch": 2.683867051145849, + "grad_norm": 0.6617736220359802, + "learning_rate": 5.529006783436267e-05, + "loss": 1.3647, + "step": 8842 + }, + { + "epoch": 2.6841705873425408, + "grad_norm": 0.639985203742981, + "learning_rate": 5.528500556849247e-05, + "loss": 1.8664, + "step": 8843 + }, + { + "epoch": 2.684474123539232, + "grad_norm": 0.8891111612319946, + "learning_rate": 5.527994330262226e-05, + "loss": 1.3448, + "step": 8844 + }, + { + "epoch": 2.6847776597359236, + "grad_norm": 0.701209306716919, + "learning_rate": 5.527488103675206e-05, + "loss": 1.735, + "step": 8845 + }, + { + "epoch": 2.685081195932615, + "grad_norm": 0.8127440810203552, + "learning_rate": 5.526981877088185e-05, + "loss": 1.4479, + "step": 8846 + }, + { + "epoch": 2.6853847321293065, + "grad_norm": 0.7177948355674744, + "learning_rate": 5.526475650501165e-05, + "loss": 1.4693, + "step": 8847 + }, + { + "epoch": 2.6856882683259977, + "grad_norm": 0.699662446975708, + "learning_rate": 5.525969423914145e-05, + "loss": 1.6241, + "step": 8848 + }, + { + "epoch": 2.6859918045226894, + "grad_norm": 0.7476281523704529, + "learning_rate": 5.5254631973271244e-05, + "loss": 1.2958, + "step": 8849 + }, + { + "epoch": 2.686295340719381, + "grad_norm": 0.7222526669502258, + "learning_rate": 5.524956970740104e-05, + "loss": 1.1927, + "step": 8850 + }, + { + "epoch": 2.6865988769160722, + "grad_norm": 0.7384173274040222, + "learning_rate": 5.5244507441530833e-05, + "loss": 1.5723, + "step": 8851 + }, + { + "epoch": 2.6869024131127635, + "grad_norm": 0.8595612645149231, + "learning_rate": 5.523944517566063e-05, + "loss": 1.4234, + "step": 8852 + }, + { + "epoch": 2.687205949309455, + "grad_norm": 0.6644933819770813, + "learning_rate": 5.523438290979043e-05, + "loss": 1.2558, + "step": 8853 + }, + { + "epoch": 2.6875094855061468, + "grad_norm": 0.7552403211593628, + "learning_rate": 5.5229320643920225e-05, + "loss": 1.5953, + "step": 8854 + }, + { + "epoch": 2.687813021702838, + "grad_norm": 0.7952251434326172, + "learning_rate": 5.522425837805002e-05, + "loss": 1.3904, + "step": 8855 + }, + { + "epoch": 2.6881165578995296, + "grad_norm": 0.821399450302124, + "learning_rate": 5.5219196112179814e-05, + "loss": 1.3383, + "step": 8856 + }, + { + "epoch": 2.688420094096221, + "grad_norm": 0.7013937830924988, + "learning_rate": 5.5214133846309616e-05, + "loss": 0.915, + "step": 8857 + }, + { + "epoch": 2.6887236302929125, + "grad_norm": 0.775075376033783, + "learning_rate": 5.520907158043941e-05, + "loss": 1.5131, + "step": 8858 + }, + { + "epoch": 2.6890271664896037, + "grad_norm": 0.7775612473487854, + "learning_rate": 5.5204009314569205e-05, + "loss": 1.3165, + "step": 8859 + }, + { + "epoch": 2.6893307026862954, + "grad_norm": 0.6457474231719971, + "learning_rate": 5.5198947048699e-05, + "loss": 1.7376, + "step": 8860 + }, + { + "epoch": 2.689634238882987, + "grad_norm": 0.8071801662445068, + "learning_rate": 5.5193884782828795e-05, + "loss": 1.052, + "step": 8861 + }, + { + "epoch": 2.6899377750796782, + "grad_norm": 0.7840722799301147, + "learning_rate": 5.5188822516958596e-05, + "loss": 1.3915, + "step": 8862 + }, + { + "epoch": 2.6902413112763695, + "grad_norm": 0.7689992189407349, + "learning_rate": 5.518376025108839e-05, + "loss": 1.4764, + "step": 8863 + }, + { + "epoch": 2.690544847473061, + "grad_norm": 0.805749237537384, + "learning_rate": 5.5178697985218186e-05, + "loss": 1.4676, + "step": 8864 + }, + { + "epoch": 2.6908483836697528, + "grad_norm": 0.6873961687088013, + "learning_rate": 5.517363571934798e-05, + "loss": 1.2869, + "step": 8865 + }, + { + "epoch": 2.691151919866444, + "grad_norm": 0.7631956338882446, + "learning_rate": 5.5168573453477776e-05, + "loss": 1.6077, + "step": 8866 + }, + { + "epoch": 2.6914554560631356, + "grad_norm": 0.8834057450294495, + "learning_rate": 5.516351118760758e-05, + "loss": 1.1535, + "step": 8867 + }, + { + "epoch": 2.691758992259827, + "grad_norm": 0.7160691618919373, + "learning_rate": 5.515844892173737e-05, + "loss": 1.3836, + "step": 8868 + }, + { + "epoch": 2.6920625284565185, + "grad_norm": 0.6507584452629089, + "learning_rate": 5.515338665586717e-05, + "loss": 1.7899, + "step": 8869 + }, + { + "epoch": 2.6923660646532097, + "grad_norm": 0.7139139771461487, + "learning_rate": 5.514832438999696e-05, + "loss": 1.4391, + "step": 8870 + }, + { + "epoch": 2.6926696008499014, + "grad_norm": 0.7380549907684326, + "learning_rate": 5.514326212412676e-05, + "loss": 0.706, + "step": 8871 + }, + { + "epoch": 2.692973137046593, + "grad_norm": 0.8420911431312561, + "learning_rate": 5.513819985825656e-05, + "loss": 0.8575, + "step": 8872 + }, + { + "epoch": 2.6932766732432842, + "grad_norm": 0.8003743886947632, + "learning_rate": 5.513313759238635e-05, + "loss": 1.4146, + "step": 8873 + }, + { + "epoch": 2.6935802094399754, + "grad_norm": 0.9504046440124512, + "learning_rate": 5.512807532651615e-05, + "loss": 1.3557, + "step": 8874 + }, + { + "epoch": 2.693883745636667, + "grad_norm": 0.9947271943092346, + "learning_rate": 5.512301306064594e-05, + "loss": 1.1656, + "step": 8875 + }, + { + "epoch": 2.6941872818333588, + "grad_norm": 0.8137509822845459, + "learning_rate": 5.5117950794775744e-05, + "loss": 1.359, + "step": 8876 + }, + { + "epoch": 2.69449081803005, + "grad_norm": 0.703301727771759, + "learning_rate": 5.511288852890554e-05, + "loss": 1.513, + "step": 8877 + }, + { + "epoch": 2.6947943542267416, + "grad_norm": 0.8018040060997009, + "learning_rate": 5.5107826263035334e-05, + "loss": 1.5769, + "step": 8878 + }, + { + "epoch": 2.695097890423433, + "grad_norm": 0.8137522339820862, + "learning_rate": 5.510276399716513e-05, + "loss": 1.4127, + "step": 8879 + }, + { + "epoch": 2.6954014266201245, + "grad_norm": 0.7223002910614014, + "learning_rate": 5.509770173129492e-05, + "loss": 1.0056, + "step": 8880 + }, + { + "epoch": 2.6957049628168157, + "grad_norm": 0.7558143734931946, + "learning_rate": 5.5092639465424725e-05, + "loss": 1.2446, + "step": 8881 + }, + { + "epoch": 2.6960084990135074, + "grad_norm": 0.8840802311897278, + "learning_rate": 5.508757719955452e-05, + "loss": 1.1856, + "step": 8882 + }, + { + "epoch": 2.696312035210199, + "grad_norm": 0.8228915333747864, + "learning_rate": 5.5082514933684314e-05, + "loss": 1.6278, + "step": 8883 + }, + { + "epoch": 2.6966155714068902, + "grad_norm": 0.984533429145813, + "learning_rate": 5.507745266781411e-05, + "loss": 0.7835, + "step": 8884 + }, + { + "epoch": 2.696919107603582, + "grad_norm": 1.7709839344024658, + "learning_rate": 5.507239040194391e-05, + "loss": 1.2948, + "step": 8885 + }, + { + "epoch": 2.697222643800273, + "grad_norm": 0.8224371671676636, + "learning_rate": 5.506732813607371e-05, + "loss": 1.0086, + "step": 8886 + }, + { + "epoch": 2.6975261799969648, + "grad_norm": 0.79408860206604, + "learning_rate": 5.5062265870203514e-05, + "loss": 1.4202, + "step": 8887 + }, + { + "epoch": 2.697829716193656, + "grad_norm": 0.6942019462585449, + "learning_rate": 5.505720360433331e-05, + "loss": 1.1817, + "step": 8888 + }, + { + "epoch": 2.6981332523903476, + "grad_norm": 0.665850818157196, + "learning_rate": 5.50521413384631e-05, + "loss": 1.6194, + "step": 8889 + }, + { + "epoch": 2.6984367885870393, + "grad_norm": 0.9044862389564514, + "learning_rate": 5.50470790725929e-05, + "loss": 1.4179, + "step": 8890 + }, + { + "epoch": 2.6987403247837305, + "grad_norm": 0.8192222118377686, + "learning_rate": 5.504201680672269e-05, + "loss": 1.3351, + "step": 8891 + }, + { + "epoch": 2.6990438609804217, + "grad_norm": 0.8500630259513855, + "learning_rate": 5.5036954540852495e-05, + "loss": 1.3052, + "step": 8892 + }, + { + "epoch": 2.6993473971771134, + "grad_norm": 1.4197098016738892, + "learning_rate": 5.503189227498229e-05, + "loss": 1.0621, + "step": 8893 + }, + { + "epoch": 2.699650933373805, + "grad_norm": 0.7029363512992859, + "learning_rate": 5.5026830009112084e-05, + "loss": 1.4191, + "step": 8894 + }, + { + "epoch": 2.6999544695704962, + "grad_norm": 0.7289296984672546, + "learning_rate": 5.502176774324188e-05, + "loss": 1.273, + "step": 8895 + }, + { + "epoch": 2.700258005767188, + "grad_norm": 0.903388261795044, + "learning_rate": 5.501670547737168e-05, + "loss": 1.3548, + "step": 8896 + }, + { + "epoch": 2.700561541963879, + "grad_norm": 0.7993081212043762, + "learning_rate": 5.5011643211501475e-05, + "loss": 1.1973, + "step": 8897 + }, + { + "epoch": 2.7008650781605708, + "grad_norm": 0.6894220113754272, + "learning_rate": 5.500658094563127e-05, + "loss": 1.2598, + "step": 8898 + }, + { + "epoch": 2.701168614357262, + "grad_norm": 0.5887405872344971, + "learning_rate": 5.5001518679761065e-05, + "loss": 0.7872, + "step": 8899 + }, + { + "epoch": 2.7014721505539536, + "grad_norm": 0.8089455366134644, + "learning_rate": 5.499645641389086e-05, + "loss": 1.3973, + "step": 8900 + }, + { + "epoch": 2.7017756867506453, + "grad_norm": 0.7730278968811035, + "learning_rate": 5.499139414802066e-05, + "loss": 1.5789, + "step": 8901 + }, + { + "epoch": 2.7020792229473365, + "grad_norm": 0.7910978198051453, + "learning_rate": 5.4986331882150456e-05, + "loss": 1.4332, + "step": 8902 + }, + { + "epoch": 2.7023827591440277, + "grad_norm": 0.8123211860656738, + "learning_rate": 5.498126961628025e-05, + "loss": 1.2123, + "step": 8903 + }, + { + "epoch": 2.7026862953407194, + "grad_norm": 0.7342512011528015, + "learning_rate": 5.4976207350410046e-05, + "loss": 1.5431, + "step": 8904 + }, + { + "epoch": 2.702989831537411, + "grad_norm": 0.8220160007476807, + "learning_rate": 5.497114508453984e-05, + "loss": 1.41, + "step": 8905 + }, + { + "epoch": 2.7032933677341022, + "grad_norm": 0.8257995843887329, + "learning_rate": 5.496608281866964e-05, + "loss": 1.323, + "step": 8906 + }, + { + "epoch": 2.703596903930794, + "grad_norm": 0.8199771046638489, + "learning_rate": 5.496102055279944e-05, + "loss": 1.5909, + "step": 8907 + }, + { + "epoch": 2.703900440127485, + "grad_norm": 0.5013839602470398, + "learning_rate": 5.495595828692923e-05, + "loss": 1.252, + "step": 8908 + }, + { + "epoch": 2.7042039763241768, + "grad_norm": 0.6575402021408081, + "learning_rate": 5.4950896021059026e-05, + "loss": 1.3464, + "step": 8909 + }, + { + "epoch": 2.704507512520868, + "grad_norm": 0.8246828317642212, + "learning_rate": 5.494583375518883e-05, + "loss": 1.2401, + "step": 8910 + }, + { + "epoch": 2.7048110487175596, + "grad_norm": 0.7484584450721741, + "learning_rate": 5.494077148931862e-05, + "loss": 0.9211, + "step": 8911 + }, + { + "epoch": 2.7051145849142513, + "grad_norm": 0.7549577355384827, + "learning_rate": 5.493570922344842e-05, + "loss": 1.1939, + "step": 8912 + }, + { + "epoch": 2.7054181211109425, + "grad_norm": 0.7589015364646912, + "learning_rate": 5.493064695757821e-05, + "loss": 1.1499, + "step": 8913 + }, + { + "epoch": 2.7057216573076337, + "grad_norm": 0.7982020378112793, + "learning_rate": 5.492558469170801e-05, + "loss": 1.3517, + "step": 8914 + }, + { + "epoch": 2.7060251935043254, + "grad_norm": 0.48329707980155945, + "learning_rate": 5.492052242583781e-05, + "loss": 1.2968, + "step": 8915 + }, + { + "epoch": 2.706328729701017, + "grad_norm": 0.9169350862503052, + "learning_rate": 5.4915460159967603e-05, + "loss": 1.2864, + "step": 8916 + }, + { + "epoch": 2.7066322658977082, + "grad_norm": 0.7144566178321838, + "learning_rate": 5.49103978940974e-05, + "loss": 1.4989, + "step": 8917 + }, + { + "epoch": 2.7069358020944, + "grad_norm": 0.9641619324684143, + "learning_rate": 5.490533562822719e-05, + "loss": 1.0338, + "step": 8918 + }, + { + "epoch": 2.707239338291091, + "grad_norm": 0.8105370402336121, + "learning_rate": 5.490027336235699e-05, + "loss": 1.4915, + "step": 8919 + }, + { + "epoch": 2.7075428744877827, + "grad_norm": 0.8228601217269897, + "learning_rate": 5.489521109648679e-05, + "loss": 1.4441, + "step": 8920 + }, + { + "epoch": 2.707846410684474, + "grad_norm": 0.7452234625816345, + "learning_rate": 5.4890148830616584e-05, + "loss": 1.3552, + "step": 8921 + }, + { + "epoch": 2.7081499468811656, + "grad_norm": 0.7853007912635803, + "learning_rate": 5.488508656474638e-05, + "loss": 1.0674, + "step": 8922 + }, + { + "epoch": 2.7084534830778573, + "grad_norm": 0.6740144491195679, + "learning_rate": 5.4880024298876174e-05, + "loss": 1.3517, + "step": 8923 + }, + { + "epoch": 2.7087570192745485, + "grad_norm": 0.739098310470581, + "learning_rate": 5.4874962033005975e-05, + "loss": 1.4536, + "step": 8924 + }, + { + "epoch": 2.7090605554712397, + "grad_norm": 0.8560205101966858, + "learning_rate": 5.486989976713577e-05, + "loss": 1.1199, + "step": 8925 + }, + { + "epoch": 2.7093640916679314, + "grad_norm": 0.6795910000801086, + "learning_rate": 5.4864837501265565e-05, + "loss": 1.4855, + "step": 8926 + }, + { + "epoch": 2.709667627864623, + "grad_norm": 0.7062528133392334, + "learning_rate": 5.485977523539536e-05, + "loss": 1.4793, + "step": 8927 + }, + { + "epoch": 2.709971164061314, + "grad_norm": 0.8141422271728516, + "learning_rate": 5.4854712969525155e-05, + "loss": 1.5419, + "step": 8928 + }, + { + "epoch": 2.710274700258006, + "grad_norm": 0.8198245763778687, + "learning_rate": 5.4849650703654956e-05, + "loss": 1.3353, + "step": 8929 + }, + { + "epoch": 2.710578236454697, + "grad_norm": 0.6655880808830261, + "learning_rate": 5.484458843778475e-05, + "loss": 1.2101, + "step": 8930 + }, + { + "epoch": 2.7108817726513887, + "grad_norm": 0.8227945566177368, + "learning_rate": 5.483952617191456e-05, + "loss": 0.6048, + "step": 8931 + }, + { + "epoch": 2.71118530884808, + "grad_norm": 0.9034178853034973, + "learning_rate": 5.4834463906044354e-05, + "loss": 1.4865, + "step": 8932 + }, + { + "epoch": 2.7114888450447716, + "grad_norm": 0.7349004149436951, + "learning_rate": 5.482940164017415e-05, + "loss": 1.5953, + "step": 8933 + }, + { + "epoch": 2.7117923812414633, + "grad_norm": 0.8077744245529175, + "learning_rate": 5.4824339374303944e-05, + "loss": 1.1701, + "step": 8934 + }, + { + "epoch": 2.7120959174381545, + "grad_norm": 0.7649827599525452, + "learning_rate": 5.4819277108433745e-05, + "loss": 1.5415, + "step": 8935 + }, + { + "epoch": 2.7123994536348457, + "grad_norm": 0.8440228700637817, + "learning_rate": 5.481421484256354e-05, + "loss": 1.1785, + "step": 8936 + }, + { + "epoch": 2.7127029898315373, + "grad_norm": 0.8126983642578125, + "learning_rate": 5.4809152576693335e-05, + "loss": 1.384, + "step": 8937 + }, + { + "epoch": 2.713006526028229, + "grad_norm": 0.8477095365524292, + "learning_rate": 5.480409031082313e-05, + "loss": 1.0354, + "step": 8938 + }, + { + "epoch": 2.71331006222492, + "grad_norm": 0.8486508727073669, + "learning_rate": 5.4799028044952924e-05, + "loss": 0.8829, + "step": 8939 + }, + { + "epoch": 2.713613598421612, + "grad_norm": 0.5038058161735535, + "learning_rate": 5.4793965779082726e-05, + "loss": 1.2798, + "step": 8940 + }, + { + "epoch": 2.713917134618303, + "grad_norm": 0.55999755859375, + "learning_rate": 5.478890351321252e-05, + "loss": 1.2635, + "step": 8941 + }, + { + "epoch": 2.7142206708149947, + "grad_norm": 0.635403573513031, + "learning_rate": 5.4783841247342316e-05, + "loss": 1.1163, + "step": 8942 + }, + { + "epoch": 2.714524207011686, + "grad_norm": 0.7358167767524719, + "learning_rate": 5.477877898147211e-05, + "loss": 0.9021, + "step": 8943 + }, + { + "epoch": 2.7148277432083776, + "grad_norm": 0.8093145489692688, + "learning_rate": 5.4773716715601905e-05, + "loss": 1.1346, + "step": 8944 + }, + { + "epoch": 2.7151312794050693, + "grad_norm": 0.7014288902282715, + "learning_rate": 5.476865444973171e-05, + "loss": 1.5469, + "step": 8945 + }, + { + "epoch": 2.7154348156017605, + "grad_norm": 0.5633707642555237, + "learning_rate": 5.47635921838615e-05, + "loss": 1.5701, + "step": 8946 + }, + { + "epoch": 2.7157383517984517, + "grad_norm": 0.6636736989021301, + "learning_rate": 5.4758529917991296e-05, + "loss": 1.1156, + "step": 8947 + }, + { + "epoch": 2.7160418879951433, + "grad_norm": 0.7108006477355957, + "learning_rate": 5.475346765212109e-05, + "loss": 1.0658, + "step": 8948 + }, + { + "epoch": 2.716345424191835, + "grad_norm": 0.7489403486251831, + "learning_rate": 5.474840538625089e-05, + "loss": 1.3565, + "step": 8949 + }, + { + "epoch": 2.716648960388526, + "grad_norm": 0.95987468957901, + "learning_rate": 5.474334312038069e-05, + "loss": 1.35, + "step": 8950 + }, + { + "epoch": 2.716952496585218, + "grad_norm": 0.857208251953125, + "learning_rate": 5.473828085451048e-05, + "loss": 1.3263, + "step": 8951 + }, + { + "epoch": 2.717256032781909, + "grad_norm": 0.6561353802680969, + "learning_rate": 5.473321858864028e-05, + "loss": 1.3822, + "step": 8952 + }, + { + "epoch": 2.7175595689786007, + "grad_norm": 0.74046391248703, + "learning_rate": 5.472815632277007e-05, + "loss": 1.4686, + "step": 8953 + }, + { + "epoch": 2.717863105175292, + "grad_norm": 0.7804528474807739, + "learning_rate": 5.4723094056899873e-05, + "loss": 1.4059, + "step": 8954 + }, + { + "epoch": 2.7181666413719836, + "grad_norm": 0.6641717553138733, + "learning_rate": 5.471803179102967e-05, + "loss": 0.9086, + "step": 8955 + }, + { + "epoch": 2.7184701775686753, + "grad_norm": 0.4145214855670929, + "learning_rate": 5.471296952515946e-05, + "loss": 0.744, + "step": 8956 + }, + { + "epoch": 2.7187737137653665, + "grad_norm": 0.8224796056747437, + "learning_rate": 5.470790725928926e-05, + "loss": 1.57, + "step": 8957 + }, + { + "epoch": 2.719077249962058, + "grad_norm": 0.9994120597839355, + "learning_rate": 5.470284499341905e-05, + "loss": 1.1499, + "step": 8958 + }, + { + "epoch": 2.7193807861587493, + "grad_norm": 0.7927001118659973, + "learning_rate": 5.4697782727548854e-05, + "loss": 1.413, + "step": 8959 + }, + { + "epoch": 2.719684322355441, + "grad_norm": 0.8578165769577026, + "learning_rate": 5.469272046167865e-05, + "loss": 0.8647, + "step": 8960 + }, + { + "epoch": 2.719987858552132, + "grad_norm": 0.6189905405044556, + "learning_rate": 5.4687658195808444e-05, + "loss": 1.3841, + "step": 8961 + }, + { + "epoch": 2.720291394748824, + "grad_norm": 1.0179373025894165, + "learning_rate": 5.468259592993824e-05, + "loss": 0.5326, + "step": 8962 + }, + { + "epoch": 2.7205949309455155, + "grad_norm": 0.6325749754905701, + "learning_rate": 5.467753366406804e-05, + "loss": 1.7126, + "step": 8963 + }, + { + "epoch": 2.7208984671422067, + "grad_norm": 0.7680848240852356, + "learning_rate": 5.4672471398197835e-05, + "loss": 1.1946, + "step": 8964 + }, + { + "epoch": 2.721202003338898, + "grad_norm": 0.8343624472618103, + "learning_rate": 5.466740913232763e-05, + "loss": 1.4145, + "step": 8965 + }, + { + "epoch": 2.7215055395355896, + "grad_norm": 0.8505995273590088, + "learning_rate": 5.4662346866457425e-05, + "loss": 0.9683, + "step": 8966 + }, + { + "epoch": 2.7218090757322813, + "grad_norm": 1.0219932794570923, + "learning_rate": 5.465728460058722e-05, + "loss": 1.3269, + "step": 8967 + }, + { + "epoch": 2.7221126119289725, + "grad_norm": 0.7481555342674255, + "learning_rate": 5.465222233471702e-05, + "loss": 1.4979, + "step": 8968 + }, + { + "epoch": 2.722416148125664, + "grad_norm": 0.7058635354042053, + "learning_rate": 5.4647160068846816e-05, + "loss": 1.9171, + "step": 8969 + }, + { + "epoch": 2.7227196843223553, + "grad_norm": 0.8569918274879456, + "learning_rate": 5.464209780297661e-05, + "loss": 1.3167, + "step": 8970 + }, + { + "epoch": 2.723023220519047, + "grad_norm": 0.7231106758117676, + "learning_rate": 5.4637035537106405e-05, + "loss": 1.3654, + "step": 8971 + }, + { + "epoch": 2.723326756715738, + "grad_norm": 0.7587422132492065, + "learning_rate": 5.46319732712362e-05, + "loss": 1.3848, + "step": 8972 + }, + { + "epoch": 2.72363029291243, + "grad_norm": 0.7885164022445679, + "learning_rate": 5.4626911005366e-05, + "loss": 1.3941, + "step": 8973 + }, + { + "epoch": 2.7239338291091215, + "grad_norm": 0.8368237614631653, + "learning_rate": 5.4621848739495796e-05, + "loss": 1.4064, + "step": 8974 + }, + { + "epoch": 2.7242373653058127, + "grad_norm": 0.8279350996017456, + "learning_rate": 5.4616786473625605e-05, + "loss": 1.3322, + "step": 8975 + }, + { + "epoch": 2.724540901502504, + "grad_norm": 0.7650869488716125, + "learning_rate": 5.46117242077554e-05, + "loss": 0.9318, + "step": 8976 + }, + { + "epoch": 2.7248444376991956, + "grad_norm": 0.6454781293869019, + "learning_rate": 5.4606661941885194e-05, + "loss": 0.9596, + "step": 8977 + }, + { + "epoch": 2.7251479738958873, + "grad_norm": 0.7179443836212158, + "learning_rate": 5.460159967601499e-05, + "loss": 1.6231, + "step": 8978 + }, + { + "epoch": 2.7254515100925785, + "grad_norm": 0.9709389805793762, + "learning_rate": 5.459653741014479e-05, + "loss": 1.3583, + "step": 8979 + }, + { + "epoch": 2.72575504628927, + "grad_norm": 0.8201824426651001, + "learning_rate": 5.4591475144274586e-05, + "loss": 1.2885, + "step": 8980 + }, + { + "epoch": 2.7260585824859613, + "grad_norm": 0.7727458477020264, + "learning_rate": 5.458641287840438e-05, + "loss": 1.1555, + "step": 8981 + }, + { + "epoch": 2.726362118682653, + "grad_norm": 0.8874642848968506, + "learning_rate": 5.4581350612534175e-05, + "loss": 1.1881, + "step": 8982 + }, + { + "epoch": 2.726665654879344, + "grad_norm": 1.0157978534698486, + "learning_rate": 5.457628834666397e-05, + "loss": 1.2026, + "step": 8983 + }, + { + "epoch": 2.726969191076036, + "grad_norm": 0.7455587983131409, + "learning_rate": 5.457122608079377e-05, + "loss": 1.1722, + "step": 8984 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 1.0781230926513672, + "learning_rate": 5.4566163814923566e-05, + "loss": 1.1869, + "step": 8985 + }, + { + "epoch": 2.7275762634694187, + "grad_norm": 0.8218075633049011, + "learning_rate": 5.456110154905336e-05, + "loss": 1.5559, + "step": 8986 + }, + { + "epoch": 2.72787979966611, + "grad_norm": 0.7117857933044434, + "learning_rate": 5.4556039283183156e-05, + "loss": 1.1925, + "step": 8987 + }, + { + "epoch": 2.7281833358628016, + "grad_norm": 0.6879022121429443, + "learning_rate": 5.455097701731296e-05, + "loss": 1.6949, + "step": 8988 + }, + { + "epoch": 2.7284868720594933, + "grad_norm": 0.5837012529373169, + "learning_rate": 5.454591475144275e-05, + "loss": 1.2666, + "step": 8989 + }, + { + "epoch": 2.7287904082561845, + "grad_norm": 0.7494475245475769, + "learning_rate": 5.454085248557255e-05, + "loss": 1.4222, + "step": 8990 + }, + { + "epoch": 2.729093944452876, + "grad_norm": 0.7593268752098083, + "learning_rate": 5.453579021970234e-05, + "loss": 1.3327, + "step": 8991 + }, + { + "epoch": 2.7293974806495673, + "grad_norm": 0.7277660965919495, + "learning_rate": 5.453072795383214e-05, + "loss": 1.652, + "step": 8992 + }, + { + "epoch": 2.729701016846259, + "grad_norm": 0.8330652713775635, + "learning_rate": 5.452566568796194e-05, + "loss": 1.461, + "step": 8993 + }, + { + "epoch": 2.73000455304295, + "grad_norm": 0.7826851606369019, + "learning_rate": 5.452060342209173e-05, + "loss": 1.526, + "step": 8994 + }, + { + "epoch": 2.730308089239642, + "grad_norm": 0.9074543118476868, + "learning_rate": 5.451554115622153e-05, + "loss": 1.181, + "step": 8995 + }, + { + "epoch": 2.7306116254363335, + "grad_norm": 0.8699624538421631, + "learning_rate": 5.451047889035132e-05, + "loss": 1.452, + "step": 8996 + }, + { + "epoch": 2.7309151616330247, + "grad_norm": 0.8349964022636414, + "learning_rate": 5.450541662448112e-05, + "loss": 1.1185, + "step": 8997 + }, + { + "epoch": 2.731218697829716, + "grad_norm": 0.885654628276825, + "learning_rate": 5.450035435861092e-05, + "loss": 0.9048, + "step": 8998 + }, + { + "epoch": 2.7315222340264076, + "grad_norm": 0.906273365020752, + "learning_rate": 5.4495292092740714e-05, + "loss": 0.9559, + "step": 8999 + }, + { + "epoch": 2.7318257702230992, + "grad_norm": 0.8029773831367493, + "learning_rate": 5.449022982687051e-05, + "loss": 1.2853, + "step": 9000 + }, + { + "epoch": 2.7321293064197905, + "grad_norm": 0.7525988817214966, + "learning_rate": 5.44851675610003e-05, + "loss": 1.0268, + "step": 9001 + }, + { + "epoch": 2.732432842616482, + "grad_norm": 0.655807375907898, + "learning_rate": 5.4480105295130105e-05, + "loss": 1.5585, + "step": 9002 + }, + { + "epoch": 2.7327363788131733, + "grad_norm": 0.8269489407539368, + "learning_rate": 5.44750430292599e-05, + "loss": 1.6588, + "step": 9003 + }, + { + "epoch": 2.733039915009865, + "grad_norm": 0.6987342238426208, + "learning_rate": 5.4469980763389694e-05, + "loss": 0.9659, + "step": 9004 + }, + { + "epoch": 2.733343451206556, + "grad_norm": 0.6727113127708435, + "learning_rate": 5.446491849751949e-05, + "loss": 1.516, + "step": 9005 + }, + { + "epoch": 2.733646987403248, + "grad_norm": 0.9753486514091492, + "learning_rate": 5.4459856231649284e-05, + "loss": 1.5715, + "step": 9006 + }, + { + "epoch": 2.7339505235999395, + "grad_norm": 0.6524636745452881, + "learning_rate": 5.4454793965779086e-05, + "loss": 1.36, + "step": 9007 + }, + { + "epoch": 2.7342540597966307, + "grad_norm": 0.8173258900642395, + "learning_rate": 5.444973169990888e-05, + "loss": 1.0862, + "step": 9008 + }, + { + "epoch": 2.734557595993322, + "grad_norm": 0.8795118927955627, + "learning_rate": 5.4444669434038675e-05, + "loss": 1.448, + "step": 9009 + }, + { + "epoch": 2.7348611321900136, + "grad_norm": 0.9424416422843933, + "learning_rate": 5.443960716816847e-05, + "loss": 1.2404, + "step": 9010 + }, + { + "epoch": 2.7351646683867052, + "grad_norm": 0.8112708926200867, + "learning_rate": 5.4434544902298265e-05, + "loss": 1.4256, + "step": 9011 + }, + { + "epoch": 2.7354682045833965, + "grad_norm": 0.7930210828781128, + "learning_rate": 5.4429482636428066e-05, + "loss": 1.5032, + "step": 9012 + }, + { + "epoch": 2.735771740780088, + "grad_norm": 0.8505734801292419, + "learning_rate": 5.442442037055786e-05, + "loss": 0.813, + "step": 9013 + }, + { + "epoch": 2.7360752769767793, + "grad_norm": 0.676701009273529, + "learning_rate": 5.4419358104687656e-05, + "loss": 1.4157, + "step": 9014 + }, + { + "epoch": 2.736378813173471, + "grad_norm": 0.9086753726005554, + "learning_rate": 5.441429583881745e-05, + "loss": 1.3663, + "step": 9015 + }, + { + "epoch": 2.736682349370162, + "grad_norm": 0.8363310098648071, + "learning_rate": 5.440923357294725e-05, + "loss": 1.5283, + "step": 9016 + }, + { + "epoch": 2.736985885566854, + "grad_norm": 0.9008285403251648, + "learning_rate": 5.440417130707705e-05, + "loss": 1.2352, + "step": 9017 + }, + { + "epoch": 2.7372894217635455, + "grad_norm": 0.6482596397399902, + "learning_rate": 5.439910904120684e-05, + "loss": 1.7285, + "step": 9018 + }, + { + "epoch": 2.7375929579602367, + "grad_norm": 0.7748271226882935, + "learning_rate": 5.439404677533664e-05, + "loss": 1.4282, + "step": 9019 + }, + { + "epoch": 2.7378964941569284, + "grad_norm": 0.7706668376922607, + "learning_rate": 5.4388984509466445e-05, + "loss": 1.4724, + "step": 9020 + }, + { + "epoch": 2.7382000303536196, + "grad_norm": 0.8214789032936096, + "learning_rate": 5.438392224359624e-05, + "loss": 1.3454, + "step": 9021 + }, + { + "epoch": 2.7385035665503112, + "grad_norm": 0.6393706202507019, + "learning_rate": 5.4378859977726035e-05, + "loss": 1.268, + "step": 9022 + }, + { + "epoch": 2.7388071027470025, + "grad_norm": 0.6789675951004028, + "learning_rate": 5.4373797711855836e-05, + "loss": 1.4618, + "step": 9023 + }, + { + "epoch": 2.739110638943694, + "grad_norm": 0.872805655002594, + "learning_rate": 5.436873544598563e-05, + "loss": 1.3131, + "step": 9024 + }, + { + "epoch": 2.7394141751403858, + "grad_norm": 0.5699925422668457, + "learning_rate": 5.4363673180115426e-05, + "loss": 0.9571, + "step": 9025 + }, + { + "epoch": 2.739717711337077, + "grad_norm": 0.7801041603088379, + "learning_rate": 5.435861091424522e-05, + "loss": 1.4479, + "step": 9026 + }, + { + "epoch": 2.740021247533768, + "grad_norm": 0.6360588669776917, + "learning_rate": 5.435354864837502e-05, + "loss": 0.355, + "step": 9027 + }, + { + "epoch": 2.74032478373046, + "grad_norm": 0.7122807502746582, + "learning_rate": 5.434848638250482e-05, + "loss": 1.3132, + "step": 9028 + }, + { + "epoch": 2.7406283199271515, + "grad_norm": 0.7230129837989807, + "learning_rate": 5.434342411663461e-05, + "loss": 0.7899, + "step": 9029 + }, + { + "epoch": 2.7409318561238427, + "grad_norm": 0.7373469471931458, + "learning_rate": 5.4338361850764407e-05, + "loss": 1.3252, + "step": 9030 + }, + { + "epoch": 2.7412353923205344, + "grad_norm": 0.7642449140548706, + "learning_rate": 5.43332995848942e-05, + "loss": 1.4505, + "step": 9031 + }, + { + "epoch": 2.7415389285172256, + "grad_norm": 0.8837107419967651, + "learning_rate": 5.4328237319024e-05, + "loss": 1.211, + "step": 9032 + }, + { + "epoch": 2.7418424647139172, + "grad_norm": 0.6932831406593323, + "learning_rate": 5.43231750531538e-05, + "loss": 1.2442, + "step": 9033 + }, + { + "epoch": 2.7421460009106084, + "grad_norm": 0.7030109167098999, + "learning_rate": 5.431811278728359e-05, + "loss": 1.6409, + "step": 9034 + }, + { + "epoch": 2.7424495371073, + "grad_norm": 1.1428993940353394, + "learning_rate": 5.431305052141339e-05, + "loss": 1.1862, + "step": 9035 + }, + { + "epoch": 2.7427530733039918, + "grad_norm": 1.2733973264694214, + "learning_rate": 5.430798825554318e-05, + "loss": 1.3141, + "step": 9036 + }, + { + "epoch": 2.743056609500683, + "grad_norm": 0.7428387403488159, + "learning_rate": 5.4302925989672984e-05, + "loss": 0.8509, + "step": 9037 + }, + { + "epoch": 2.743360145697374, + "grad_norm": 0.7533442974090576, + "learning_rate": 5.429786372380278e-05, + "loss": 1.3576, + "step": 9038 + }, + { + "epoch": 2.743663681894066, + "grad_norm": 0.8623830080032349, + "learning_rate": 5.429280145793257e-05, + "loss": 1.1348, + "step": 9039 + }, + { + "epoch": 2.7439672180907575, + "grad_norm": 0.7006097435951233, + "learning_rate": 5.428773919206237e-05, + "loss": 1.1323, + "step": 9040 + }, + { + "epoch": 2.7442707542874487, + "grad_norm": 0.6233174800872803, + "learning_rate": 5.428267692619217e-05, + "loss": 1.5301, + "step": 9041 + }, + { + "epoch": 2.7445742904841404, + "grad_norm": 0.8230398893356323, + "learning_rate": 5.4277614660321964e-05, + "loss": 1.231, + "step": 9042 + }, + { + "epoch": 2.7448778266808316, + "grad_norm": 0.7485307455062866, + "learning_rate": 5.427255239445176e-05, + "loss": 1.0382, + "step": 9043 + }, + { + "epoch": 2.7451813628775232, + "grad_norm": 1.0905840396881104, + "learning_rate": 5.4267490128581554e-05, + "loss": 1.2776, + "step": 9044 + }, + { + "epoch": 2.7454848990742144, + "grad_norm": 0.8306492567062378, + "learning_rate": 5.426242786271135e-05, + "loss": 1.3409, + "step": 9045 + }, + { + "epoch": 2.745788435270906, + "grad_norm": 0.7774491906166077, + "learning_rate": 5.425736559684115e-05, + "loss": 1.5055, + "step": 9046 + }, + { + "epoch": 2.7460919714675978, + "grad_norm": 0.8531390428543091, + "learning_rate": 5.4252303330970945e-05, + "loss": 1.1088, + "step": 9047 + }, + { + "epoch": 2.746395507664289, + "grad_norm": 0.8016642332077026, + "learning_rate": 5.424724106510074e-05, + "loss": 1.3492, + "step": 9048 + }, + { + "epoch": 2.74669904386098, + "grad_norm": 0.5586883425712585, + "learning_rate": 5.4242178799230535e-05, + "loss": 1.1221, + "step": 9049 + }, + { + "epoch": 2.747002580057672, + "grad_norm": 0.8126424551010132, + "learning_rate": 5.423711653336033e-05, + "loss": 1.298, + "step": 9050 + }, + { + "epoch": 2.7473061162543635, + "grad_norm": 0.8375364542007446, + "learning_rate": 5.423205426749013e-05, + "loss": 1.2794, + "step": 9051 + }, + { + "epoch": 2.7476096524510547, + "grad_norm": 0.8436753749847412, + "learning_rate": 5.4226992001619926e-05, + "loss": 1.4027, + "step": 9052 + }, + { + "epoch": 2.7479131886477464, + "grad_norm": 0.681559145450592, + "learning_rate": 5.422192973574972e-05, + "loss": 1.491, + "step": 9053 + }, + { + "epoch": 2.7482167248444376, + "grad_norm": 0.7104101777076721, + "learning_rate": 5.4216867469879516e-05, + "loss": 1.5193, + "step": 9054 + }, + { + "epoch": 2.7485202610411292, + "grad_norm": 0.8452191948890686, + "learning_rate": 5.421180520400932e-05, + "loss": 1.0921, + "step": 9055 + }, + { + "epoch": 2.7488237972378204, + "grad_norm": 0.7584163546562195, + "learning_rate": 5.420674293813911e-05, + "loss": 1.6706, + "step": 9056 + }, + { + "epoch": 2.749127333434512, + "grad_norm": 0.6813714504241943, + "learning_rate": 5.420168067226891e-05, + "loss": 1.7236, + "step": 9057 + }, + { + "epoch": 2.7494308696312038, + "grad_norm": 0.9535608887672424, + "learning_rate": 5.41966184063987e-05, + "loss": 1.4805, + "step": 9058 + }, + { + "epoch": 2.749734405827895, + "grad_norm": 0.6872257590293884, + "learning_rate": 5.4191556140528496e-05, + "loss": 1.7005, + "step": 9059 + }, + { + "epoch": 2.750037942024586, + "grad_norm": 0.8195981383323669, + "learning_rate": 5.41864938746583e-05, + "loss": 1.2913, + "step": 9060 + }, + { + "epoch": 2.750341478221278, + "grad_norm": 0.8541540503501892, + "learning_rate": 5.418143160878809e-05, + "loss": 1.4657, + "step": 9061 + }, + { + "epoch": 2.7506450144179695, + "grad_norm": 0.8466823101043701, + "learning_rate": 5.417636934291789e-05, + "loss": 1.0107, + "step": 9062 + }, + { + "epoch": 2.7509485506146607, + "grad_norm": 0.7837762832641602, + "learning_rate": 5.417130707704768e-05, + "loss": 1.5619, + "step": 9063 + }, + { + "epoch": 2.7512520868113524, + "grad_norm": 0.7531890869140625, + "learning_rate": 5.416624481117749e-05, + "loss": 1.2658, + "step": 9064 + }, + { + "epoch": 2.7515556230080436, + "grad_norm": 0.6701220273971558, + "learning_rate": 5.4161182545307285e-05, + "loss": 1.1735, + "step": 9065 + }, + { + "epoch": 2.7518591592047352, + "grad_norm": 0.8095039129257202, + "learning_rate": 5.415612027943709e-05, + "loss": 1.7546, + "step": 9066 + }, + { + "epoch": 2.7521626954014264, + "grad_norm": 0.8936877250671387, + "learning_rate": 5.415105801356688e-05, + "loss": 0.9218, + "step": 9067 + }, + { + "epoch": 2.752466231598118, + "grad_norm": 0.729854166507721, + "learning_rate": 5.4145995747696677e-05, + "loss": 1.5037, + "step": 9068 + }, + { + "epoch": 2.7527697677948098, + "grad_norm": 0.6929486393928528, + "learning_rate": 5.414093348182647e-05, + "loss": 1.4376, + "step": 9069 + }, + { + "epoch": 2.753073303991501, + "grad_norm": 0.6936506628990173, + "learning_rate": 5.4135871215956266e-05, + "loss": 1.5083, + "step": 9070 + }, + { + "epoch": 2.753376840188192, + "grad_norm": 0.7572258114814758, + "learning_rate": 5.413080895008607e-05, + "loss": 1.5286, + "step": 9071 + }, + { + "epoch": 2.753680376384884, + "grad_norm": 0.9175770282745361, + "learning_rate": 5.412574668421586e-05, + "loss": 1.1926, + "step": 9072 + }, + { + "epoch": 2.7539839125815755, + "grad_norm": 0.681115984916687, + "learning_rate": 5.412068441834566e-05, + "loss": 1.4447, + "step": 9073 + }, + { + "epoch": 2.7542874487782667, + "grad_norm": 0.8459120988845825, + "learning_rate": 5.411562215247545e-05, + "loss": 1.6663, + "step": 9074 + }, + { + "epoch": 2.7545909849749584, + "grad_norm": 0.796818733215332, + "learning_rate": 5.411055988660525e-05, + "loss": 0.9759, + "step": 9075 + }, + { + "epoch": 2.7548945211716496, + "grad_norm": 0.9406018853187561, + "learning_rate": 5.410549762073505e-05, + "loss": 1.4791, + "step": 9076 + }, + { + "epoch": 2.7551980573683412, + "grad_norm": 0.7923262715339661, + "learning_rate": 5.410043535486484e-05, + "loss": 1.6416, + "step": 9077 + }, + { + "epoch": 2.7555015935650324, + "grad_norm": 0.6868494749069214, + "learning_rate": 5.409537308899464e-05, + "loss": 1.0971, + "step": 9078 + }, + { + "epoch": 2.755805129761724, + "grad_norm": 0.6387277245521545, + "learning_rate": 5.409031082312443e-05, + "loss": 1.3577, + "step": 9079 + }, + { + "epoch": 2.7561086659584157, + "grad_norm": 0.7907262444496155, + "learning_rate": 5.4085248557254234e-05, + "loss": 1.6351, + "step": 9080 + }, + { + "epoch": 2.756412202155107, + "grad_norm": 0.8075240850448608, + "learning_rate": 5.408018629138403e-05, + "loss": 1.2531, + "step": 9081 + }, + { + "epoch": 2.7567157383517986, + "grad_norm": 0.7291090488433838, + "learning_rate": 5.4075124025513824e-05, + "loss": 1.5374, + "step": 9082 + }, + { + "epoch": 2.75701927454849, + "grad_norm": 0.8079223036766052, + "learning_rate": 5.407006175964362e-05, + "loss": 1.5611, + "step": 9083 + }, + { + "epoch": 2.7573228107451815, + "grad_norm": 0.6828078031539917, + "learning_rate": 5.4064999493773414e-05, + "loss": 1.2459, + "step": 9084 + }, + { + "epoch": 2.7576263469418727, + "grad_norm": 0.8029616475105286, + "learning_rate": 5.4059937227903215e-05, + "loss": 0.7535, + "step": 9085 + }, + { + "epoch": 2.7579298831385644, + "grad_norm": 0.7968683242797852, + "learning_rate": 5.405487496203301e-05, + "loss": 1.4482, + "step": 9086 + }, + { + "epoch": 2.7582334193352556, + "grad_norm": 0.6600356698036194, + "learning_rate": 5.4049812696162805e-05, + "loss": 1.0617, + "step": 9087 + }, + { + "epoch": 2.758536955531947, + "grad_norm": 0.6818645596504211, + "learning_rate": 5.40447504302926e-05, + "loss": 1.0022, + "step": 9088 + }, + { + "epoch": 2.7588404917286384, + "grad_norm": 0.704681396484375, + "learning_rate": 5.4039688164422394e-05, + "loss": 1.4996, + "step": 9089 + }, + { + "epoch": 2.75914402792533, + "grad_norm": 0.7003867626190186, + "learning_rate": 5.4034625898552196e-05, + "loss": 1.3784, + "step": 9090 + }, + { + "epoch": 2.7594475641220217, + "grad_norm": 0.7893413305282593, + "learning_rate": 5.402956363268199e-05, + "loss": 1.4548, + "step": 9091 + }, + { + "epoch": 2.759751100318713, + "grad_norm": 0.7460066676139832, + "learning_rate": 5.4024501366811785e-05, + "loss": 1.3411, + "step": 9092 + }, + { + "epoch": 2.7600546365154046, + "grad_norm": 0.7303305268287659, + "learning_rate": 5.401943910094158e-05, + "loss": 1.4287, + "step": 9093 + }, + { + "epoch": 2.760358172712096, + "grad_norm": 0.87126225233078, + "learning_rate": 5.401437683507138e-05, + "loss": 1.4208, + "step": 9094 + }, + { + "epoch": 2.7606617089087875, + "grad_norm": 0.7304555177688599, + "learning_rate": 5.400931456920118e-05, + "loss": 1.423, + "step": 9095 + }, + { + "epoch": 2.7609652451054787, + "grad_norm": 0.9491742849349976, + "learning_rate": 5.400425230333097e-05, + "loss": 1.3035, + "step": 9096 + }, + { + "epoch": 2.7612687813021703, + "grad_norm": 0.6813849210739136, + "learning_rate": 5.3999190037460766e-05, + "loss": 1.3806, + "step": 9097 + }, + { + "epoch": 2.761572317498862, + "grad_norm": 0.8438676595687866, + "learning_rate": 5.399412777159056e-05, + "loss": 1.3342, + "step": 9098 + }, + { + "epoch": 2.761875853695553, + "grad_norm": 1.1256211996078491, + "learning_rate": 5.398906550572036e-05, + "loss": 1.2017, + "step": 9099 + }, + { + "epoch": 2.7621793898922444, + "grad_norm": 0.9026702642440796, + "learning_rate": 5.398400323985016e-05, + "loss": 1.218, + "step": 9100 + }, + { + "epoch": 2.762482926088936, + "grad_norm": 0.7552817463874817, + "learning_rate": 5.397894097397995e-05, + "loss": 1.3191, + "step": 9101 + }, + { + "epoch": 2.7627864622856277, + "grad_norm": 0.7226763367652893, + "learning_rate": 5.397387870810975e-05, + "loss": 1.1564, + "step": 9102 + }, + { + "epoch": 2.763089998482319, + "grad_norm": 0.8583779335021973, + "learning_rate": 5.396881644223954e-05, + "loss": 1.2648, + "step": 9103 + }, + { + "epoch": 2.7633935346790106, + "grad_norm": 0.7059534192085266, + "learning_rate": 5.396375417636934e-05, + "loss": 1.2827, + "step": 9104 + }, + { + "epoch": 2.763697070875702, + "grad_norm": 0.7585148811340332, + "learning_rate": 5.395869191049914e-05, + "loss": 1.4881, + "step": 9105 + }, + { + "epoch": 2.7640006070723935, + "grad_norm": 0.7276778817176819, + "learning_rate": 5.395362964462893e-05, + "loss": 1.0925, + "step": 9106 + }, + { + "epoch": 2.7643041432690847, + "grad_norm": 0.835493803024292, + "learning_rate": 5.394856737875873e-05, + "loss": 1.3873, + "step": 9107 + }, + { + "epoch": 2.7646076794657763, + "grad_norm": 0.8759173154830933, + "learning_rate": 5.394350511288853e-05, + "loss": 1.0156, + "step": 9108 + }, + { + "epoch": 2.764911215662468, + "grad_norm": 0.8837414383888245, + "learning_rate": 5.393844284701833e-05, + "loss": 1.512, + "step": 9109 + }, + { + "epoch": 2.765214751859159, + "grad_norm": 0.6770848035812378, + "learning_rate": 5.393338058114813e-05, + "loss": 1.489, + "step": 9110 + }, + { + "epoch": 2.7655182880558504, + "grad_norm": 0.7312859892845154, + "learning_rate": 5.392831831527793e-05, + "loss": 0.9014, + "step": 9111 + }, + { + "epoch": 2.765821824252542, + "grad_norm": 0.6844790577888489, + "learning_rate": 5.392325604940772e-05, + "loss": 1.3431, + "step": 9112 + }, + { + "epoch": 2.7661253604492337, + "grad_norm": 0.7217748165130615, + "learning_rate": 5.391819378353752e-05, + "loss": 1.3692, + "step": 9113 + }, + { + "epoch": 2.766428896645925, + "grad_norm": 0.9370319247245789, + "learning_rate": 5.391313151766731e-05, + "loss": 0.9882, + "step": 9114 + }, + { + "epoch": 2.7667324328426166, + "grad_norm": 0.9519034028053284, + "learning_rate": 5.390806925179711e-05, + "loss": 1.3053, + "step": 9115 + }, + { + "epoch": 2.767035969039308, + "grad_norm": 0.5022948384284973, + "learning_rate": 5.390300698592691e-05, + "loss": 1.9471, + "step": 9116 + }, + { + "epoch": 2.7673395052359995, + "grad_norm": 0.7742490768432617, + "learning_rate": 5.38979447200567e-05, + "loss": 0.8012, + "step": 9117 + }, + { + "epoch": 2.7676430414326907, + "grad_norm": 0.7089831233024597, + "learning_rate": 5.38928824541865e-05, + "loss": 1.3914, + "step": 9118 + }, + { + "epoch": 2.7679465776293823, + "grad_norm": 0.7711650729179382, + "learning_rate": 5.38878201883163e-05, + "loss": 1.1509, + "step": 9119 + }, + { + "epoch": 2.768250113826074, + "grad_norm": 0.7289400696754456, + "learning_rate": 5.3882757922446094e-05, + "loss": 1.184, + "step": 9120 + }, + { + "epoch": 2.768553650022765, + "grad_norm": 0.7543368339538574, + "learning_rate": 5.387769565657589e-05, + "loss": 1.215, + "step": 9121 + }, + { + "epoch": 2.7688571862194564, + "grad_norm": 0.8299732804298401, + "learning_rate": 5.3872633390705684e-05, + "loss": 1.5178, + "step": 9122 + }, + { + "epoch": 2.769160722416148, + "grad_norm": 0.9031606912612915, + "learning_rate": 5.386757112483548e-05, + "loss": 1.237, + "step": 9123 + }, + { + "epoch": 2.7694642586128397, + "grad_norm": 0.7585662007331848, + "learning_rate": 5.386250885896528e-05, + "loss": 1.1908, + "step": 9124 + }, + { + "epoch": 2.769767794809531, + "grad_norm": 0.8985230326652527, + "learning_rate": 5.3857446593095075e-05, + "loss": 1.495, + "step": 9125 + }, + { + "epoch": 2.7700713310062226, + "grad_norm": 0.8069576621055603, + "learning_rate": 5.385238432722487e-05, + "loss": 1.5, + "step": 9126 + }, + { + "epoch": 2.770374867202914, + "grad_norm": 0.9044194221496582, + "learning_rate": 5.3847322061354664e-05, + "loss": 1.0361, + "step": 9127 + }, + { + "epoch": 2.7706784033996055, + "grad_norm": 0.749674379825592, + "learning_rate": 5.384225979548446e-05, + "loss": 1.4786, + "step": 9128 + }, + { + "epoch": 2.7709819395962967, + "grad_norm": 1.505496621131897, + "learning_rate": 5.383719752961426e-05, + "loss": 0.8318, + "step": 9129 + }, + { + "epoch": 2.7712854757929883, + "grad_norm": 0.8201859593391418, + "learning_rate": 5.3832135263744055e-05, + "loss": 1.1839, + "step": 9130 + }, + { + "epoch": 2.77158901198968, + "grad_norm": 0.7365986704826355, + "learning_rate": 5.382707299787385e-05, + "loss": 1.4694, + "step": 9131 + }, + { + "epoch": 2.771892548186371, + "grad_norm": 0.8138245344161987, + "learning_rate": 5.3822010732003645e-05, + "loss": 1.5147, + "step": 9132 + }, + { + "epoch": 2.7721960843830624, + "grad_norm": 0.7796532511711121, + "learning_rate": 5.3816948466133447e-05, + "loss": 0.7226, + "step": 9133 + }, + { + "epoch": 2.772499620579754, + "grad_norm": 0.988336443901062, + "learning_rate": 5.381188620026324e-05, + "loss": 1.3447, + "step": 9134 + }, + { + "epoch": 2.7728031567764457, + "grad_norm": 0.5842739343643188, + "learning_rate": 5.3806823934393036e-05, + "loss": 1.0944, + "step": 9135 + }, + { + "epoch": 2.773106692973137, + "grad_norm": 0.648903489112854, + "learning_rate": 5.380176166852283e-05, + "loss": 1.281, + "step": 9136 + }, + { + "epoch": 2.7734102291698286, + "grad_norm": 0.6385083794593811, + "learning_rate": 5.3796699402652626e-05, + "loss": 1.7031, + "step": 9137 + }, + { + "epoch": 2.77371376536652, + "grad_norm": 0.8002315759658813, + "learning_rate": 5.379163713678243e-05, + "loss": 1.2138, + "step": 9138 + }, + { + "epoch": 2.7740173015632115, + "grad_norm": 0.8462424278259277, + "learning_rate": 5.378657487091222e-05, + "loss": 1.4935, + "step": 9139 + }, + { + "epoch": 2.7743208377599027, + "grad_norm": 0.8711588382720947, + "learning_rate": 5.378151260504202e-05, + "loss": 1.3314, + "step": 9140 + }, + { + "epoch": 2.7746243739565943, + "grad_norm": 1.0606752634048462, + "learning_rate": 5.377645033917181e-05, + "loss": 1.1583, + "step": 9141 + }, + { + "epoch": 2.774927910153286, + "grad_norm": 0.8558630347251892, + "learning_rate": 5.3771388073301607e-05, + "loss": 1.2429, + "step": 9142 + }, + { + "epoch": 2.775231446349977, + "grad_norm": 0.7171871662139893, + "learning_rate": 5.376632580743141e-05, + "loss": 1.595, + "step": 9143 + }, + { + "epoch": 2.7755349825466684, + "grad_norm": 0.7023593187332153, + "learning_rate": 5.37612635415612e-05, + "loss": 1.0014, + "step": 9144 + }, + { + "epoch": 2.77583851874336, + "grad_norm": 0.8242972493171692, + "learning_rate": 5.3756201275691e-05, + "loss": 1.496, + "step": 9145 + }, + { + "epoch": 2.7761420549400517, + "grad_norm": 0.9640794396400452, + "learning_rate": 5.375113900982079e-05, + "loss": 1.5646, + "step": 9146 + }, + { + "epoch": 2.776445591136743, + "grad_norm": 0.6254851222038269, + "learning_rate": 5.3746076743950594e-05, + "loss": 1.2525, + "step": 9147 + }, + { + "epoch": 2.7767491273334346, + "grad_norm": 0.7365807294845581, + "learning_rate": 5.374101447808039e-05, + "loss": 1.3743, + "step": 9148 + }, + { + "epoch": 2.777052663530126, + "grad_norm": 0.8320026397705078, + "learning_rate": 5.3735952212210184e-05, + "loss": 1.5029, + "step": 9149 + }, + { + "epoch": 2.7773561997268175, + "grad_norm": 0.8140934109687805, + "learning_rate": 5.373088994633998e-05, + "loss": 0.7684, + "step": 9150 + }, + { + "epoch": 2.7776597359235087, + "grad_norm": 0.6923708319664001, + "learning_rate": 5.372582768046977e-05, + "loss": 1.2284, + "step": 9151 + }, + { + "epoch": 2.7779632721202003, + "grad_norm": 0.838914692401886, + "learning_rate": 5.3720765414599575e-05, + "loss": 1.2168, + "step": 9152 + }, + { + "epoch": 2.778266808316892, + "grad_norm": 0.8263568878173828, + "learning_rate": 5.3715703148729376e-05, + "loss": 1.4743, + "step": 9153 + }, + { + "epoch": 2.778570344513583, + "grad_norm": 0.7731677889823914, + "learning_rate": 5.371064088285918e-05, + "loss": 1.4636, + "step": 9154 + }, + { + "epoch": 2.778873880710275, + "grad_norm": 0.8379889130592346, + "learning_rate": 5.370557861698897e-05, + "loss": 1.5557, + "step": 9155 + }, + { + "epoch": 2.779177416906966, + "grad_norm": 0.8932848572731018, + "learning_rate": 5.370051635111877e-05, + "loss": 1.1184, + "step": 9156 + }, + { + "epoch": 2.7794809531036577, + "grad_norm": 0.6494554877281189, + "learning_rate": 5.369545408524856e-05, + "loss": 1.4887, + "step": 9157 + }, + { + "epoch": 2.779784489300349, + "grad_norm": 0.7722667455673218, + "learning_rate": 5.3690391819378364e-05, + "loss": 1.0282, + "step": 9158 + }, + { + "epoch": 2.7800880254970406, + "grad_norm": 0.7183326482772827, + "learning_rate": 5.368532955350816e-05, + "loss": 1.3558, + "step": 9159 + }, + { + "epoch": 2.7803915616937322, + "grad_norm": 0.7643415927886963, + "learning_rate": 5.3680267287637953e-05, + "loss": 0.8731, + "step": 9160 + }, + { + "epoch": 2.7806950978904235, + "grad_norm": 1.0144984722137451, + "learning_rate": 5.367520502176775e-05, + "loss": 0.7752, + "step": 9161 + }, + { + "epoch": 2.7809986340871147, + "grad_norm": 0.7115720510482788, + "learning_rate": 5.367014275589754e-05, + "loss": 1.4928, + "step": 9162 + }, + { + "epoch": 2.7813021702838063, + "grad_norm": 0.9345402717590332, + "learning_rate": 5.3665080490027345e-05, + "loss": 1.2511, + "step": 9163 + }, + { + "epoch": 2.781605706480498, + "grad_norm": 0.8848560452461243, + "learning_rate": 5.366001822415714e-05, + "loss": 1.4007, + "step": 9164 + }, + { + "epoch": 2.781909242677189, + "grad_norm": 0.7608636021614075, + "learning_rate": 5.3654955958286934e-05, + "loss": 0.6799, + "step": 9165 + }, + { + "epoch": 2.782212778873881, + "grad_norm": 0.9123556613922119, + "learning_rate": 5.364989369241673e-05, + "loss": 0.9096, + "step": 9166 + }, + { + "epoch": 2.782516315070572, + "grad_norm": 0.9755141139030457, + "learning_rate": 5.3644831426546524e-05, + "loss": 1.336, + "step": 9167 + }, + { + "epoch": 2.7828198512672637, + "grad_norm": 0.8301700949668884, + "learning_rate": 5.3639769160676325e-05, + "loss": 1.6611, + "step": 9168 + }, + { + "epoch": 2.783123387463955, + "grad_norm": 0.6016411185264587, + "learning_rate": 5.363470689480612e-05, + "loss": 1.6561, + "step": 9169 + }, + { + "epoch": 2.7834269236606466, + "grad_norm": 0.8668585419654846, + "learning_rate": 5.3629644628935915e-05, + "loss": 1.1069, + "step": 9170 + }, + { + "epoch": 2.7837304598573382, + "grad_norm": 0.855521023273468, + "learning_rate": 5.362458236306571e-05, + "loss": 1.4181, + "step": 9171 + }, + { + "epoch": 2.7840339960540295, + "grad_norm": 0.7183941602706909, + "learning_rate": 5.361952009719551e-05, + "loss": 1.4732, + "step": 9172 + }, + { + "epoch": 2.7843375322507207, + "grad_norm": 0.6905990839004517, + "learning_rate": 5.3614457831325306e-05, + "loss": 1.6827, + "step": 9173 + }, + { + "epoch": 2.7846410684474123, + "grad_norm": 0.5364888906478882, + "learning_rate": 5.36093955654551e-05, + "loss": 0.8329, + "step": 9174 + }, + { + "epoch": 2.784944604644104, + "grad_norm": 0.7076054215431213, + "learning_rate": 5.3604333299584896e-05, + "loss": 0.5921, + "step": 9175 + }, + { + "epoch": 2.785248140840795, + "grad_norm": 0.8610938191413879, + "learning_rate": 5.359927103371469e-05, + "loss": 1.206, + "step": 9176 + }, + { + "epoch": 2.785551677037487, + "grad_norm": 0.8130373954772949, + "learning_rate": 5.359420876784449e-05, + "loss": 1.4456, + "step": 9177 + }, + { + "epoch": 2.785855213234178, + "grad_norm": 0.6824911236763, + "learning_rate": 5.358914650197429e-05, + "loss": 1.5016, + "step": 9178 + }, + { + "epoch": 2.7861587494308697, + "grad_norm": 0.9456865191459656, + "learning_rate": 5.358408423610408e-05, + "loss": 1.3263, + "step": 9179 + }, + { + "epoch": 2.786462285627561, + "grad_norm": 0.8320668339729309, + "learning_rate": 5.3579021970233876e-05, + "loss": 0.8731, + "step": 9180 + }, + { + "epoch": 2.7867658218242526, + "grad_norm": 1.1539381742477417, + "learning_rate": 5.357395970436367e-05, + "loss": 1.0368, + "step": 9181 + }, + { + "epoch": 2.7870693580209442, + "grad_norm": 0.869157075881958, + "learning_rate": 5.356889743849347e-05, + "loss": 1.3705, + "step": 9182 + }, + { + "epoch": 2.7873728942176355, + "grad_norm": 0.7219955325126648, + "learning_rate": 5.356383517262327e-05, + "loss": 1.7219, + "step": 9183 + }, + { + "epoch": 2.7876764304143267, + "grad_norm": 1.016178011894226, + "learning_rate": 5.355877290675306e-05, + "loss": 1.5799, + "step": 9184 + }, + { + "epoch": 2.7879799666110183, + "grad_norm": 0.8265878558158875, + "learning_rate": 5.355371064088286e-05, + "loss": 1.2097, + "step": 9185 + }, + { + "epoch": 2.78828350280771, + "grad_norm": 0.7330758571624756, + "learning_rate": 5.354864837501266e-05, + "loss": 1.0995, + "step": 9186 + }, + { + "epoch": 2.788587039004401, + "grad_norm": 0.733619213104248, + "learning_rate": 5.3543586109142454e-05, + "loss": 1.6089, + "step": 9187 + }, + { + "epoch": 2.788890575201093, + "grad_norm": 0.6172709465026855, + "learning_rate": 5.353852384327225e-05, + "loss": 1.6118, + "step": 9188 + }, + { + "epoch": 2.789194111397784, + "grad_norm": 0.7603530883789062, + "learning_rate": 5.353346157740204e-05, + "loss": 1.6177, + "step": 9189 + }, + { + "epoch": 2.7894976475944757, + "grad_norm": 0.6322783827781677, + "learning_rate": 5.352839931153184e-05, + "loss": 1.4191, + "step": 9190 + }, + { + "epoch": 2.789801183791167, + "grad_norm": 0.8380208015441895, + "learning_rate": 5.352333704566164e-05, + "loss": 0.8711, + "step": 9191 + }, + { + "epoch": 2.7901047199878586, + "grad_norm": 0.866492509841919, + "learning_rate": 5.3518274779791434e-05, + "loss": 1.7399, + "step": 9192 + }, + { + "epoch": 2.7904082561845502, + "grad_norm": 0.8822951316833496, + "learning_rate": 5.351321251392123e-05, + "loss": 1.0067, + "step": 9193 + }, + { + "epoch": 2.7907117923812415, + "grad_norm": 0.7827437520027161, + "learning_rate": 5.3508150248051024e-05, + "loss": 1.5419, + "step": 9194 + }, + { + "epoch": 2.7910153285779327, + "grad_norm": 0.6779170632362366, + "learning_rate": 5.350308798218082e-05, + "loss": 1.5052, + "step": 9195 + }, + { + "epoch": 2.7913188647746243, + "grad_norm": 0.8421553373336792, + "learning_rate": 5.349802571631062e-05, + "loss": 1.3913, + "step": 9196 + }, + { + "epoch": 2.791622400971316, + "grad_norm": 0.8894978761672974, + "learning_rate": 5.3492963450440415e-05, + "loss": 1.2967, + "step": 9197 + }, + { + "epoch": 2.791925937168007, + "grad_norm": 0.8642515540122986, + "learning_rate": 5.3487901184570223e-05, + "loss": 1.5373, + "step": 9198 + }, + { + "epoch": 2.792229473364699, + "grad_norm": 0.8763284683227539, + "learning_rate": 5.348283891870002e-05, + "loss": 1.5245, + "step": 9199 + }, + { + "epoch": 2.79253300956139, + "grad_norm": 0.8263570070266724, + "learning_rate": 5.347777665282981e-05, + "loss": 1.5697, + "step": 9200 + }, + { + "epoch": 2.7928365457580817, + "grad_norm": 0.8536450266838074, + "learning_rate": 5.347271438695961e-05, + "loss": 1.5989, + "step": 9201 + }, + { + "epoch": 2.793140081954773, + "grad_norm": 0.8243657350540161, + "learning_rate": 5.346765212108941e-05, + "loss": 1.4002, + "step": 9202 + }, + { + "epoch": 2.7934436181514646, + "grad_norm": 0.7956419587135315, + "learning_rate": 5.3462589855219204e-05, + "loss": 1.5728, + "step": 9203 + }, + { + "epoch": 2.7937471543481562, + "grad_norm": 0.7345757484436035, + "learning_rate": 5.3457527589349e-05, + "loss": 1.6434, + "step": 9204 + }, + { + "epoch": 2.7940506905448474, + "grad_norm": 0.7123274803161621, + "learning_rate": 5.3452465323478794e-05, + "loss": 1.248, + "step": 9205 + }, + { + "epoch": 2.7943542267415387, + "grad_norm": 0.8604891300201416, + "learning_rate": 5.344740305760859e-05, + "loss": 1.3854, + "step": 9206 + }, + { + "epoch": 2.7946577629382303, + "grad_norm": 0.7543156743049622, + "learning_rate": 5.344234079173839e-05, + "loss": 1.1674, + "step": 9207 + }, + { + "epoch": 2.794961299134922, + "grad_norm": 0.6968750357627869, + "learning_rate": 5.3437278525868185e-05, + "loss": 1.6849, + "step": 9208 + }, + { + "epoch": 2.795264835331613, + "grad_norm": 0.6939352750778198, + "learning_rate": 5.343221625999798e-05, + "loss": 1.4659, + "step": 9209 + }, + { + "epoch": 2.795568371528305, + "grad_norm": 0.8209272027015686, + "learning_rate": 5.3427153994127775e-05, + "loss": 1.0141, + "step": 9210 + }, + { + "epoch": 2.795871907724996, + "grad_norm": 0.7585069537162781, + "learning_rate": 5.3422091728257576e-05, + "loss": 1.4965, + "step": 9211 + }, + { + "epoch": 2.7961754439216877, + "grad_norm": 0.7784894704818726, + "learning_rate": 5.341702946238737e-05, + "loss": 1.7735, + "step": 9212 + }, + { + "epoch": 2.796478980118379, + "grad_norm": 0.6751682758331299, + "learning_rate": 5.3411967196517166e-05, + "loss": 1.753, + "step": 9213 + }, + { + "epoch": 2.7967825163150706, + "grad_norm": 0.8344226479530334, + "learning_rate": 5.340690493064696e-05, + "loss": 1.3489, + "step": 9214 + }, + { + "epoch": 2.7970860525117622, + "grad_norm": 0.671781063079834, + "learning_rate": 5.3401842664776755e-05, + "loss": 1.5133, + "step": 9215 + }, + { + "epoch": 2.7973895887084534, + "grad_norm": 0.5930084586143494, + "learning_rate": 5.339678039890656e-05, + "loss": 0.9884, + "step": 9216 + }, + { + "epoch": 2.797693124905145, + "grad_norm": 0.8687995076179504, + "learning_rate": 5.339171813303635e-05, + "loss": 1.4782, + "step": 9217 + }, + { + "epoch": 2.7979966611018363, + "grad_norm": 0.9404574036598206, + "learning_rate": 5.3386655867166146e-05, + "loss": 1.373, + "step": 9218 + }, + { + "epoch": 2.798300197298528, + "grad_norm": 0.4988918900489807, + "learning_rate": 5.338159360129594e-05, + "loss": 1.2758, + "step": 9219 + }, + { + "epoch": 2.798603733495219, + "grad_norm": 0.7535762786865234, + "learning_rate": 5.3376531335425736e-05, + "loss": 0.9864, + "step": 9220 + }, + { + "epoch": 2.798907269691911, + "grad_norm": 0.8633400797843933, + "learning_rate": 5.337146906955554e-05, + "loss": 1.5846, + "step": 9221 + }, + { + "epoch": 2.799210805888602, + "grad_norm": 1.015762209892273, + "learning_rate": 5.336640680368533e-05, + "loss": 1.4976, + "step": 9222 + }, + { + "epoch": 2.7995143420852937, + "grad_norm": 0.7161250114440918, + "learning_rate": 5.336134453781513e-05, + "loss": 1.5172, + "step": 9223 + }, + { + "epoch": 2.799817878281985, + "grad_norm": 0.7022544741630554, + "learning_rate": 5.335628227194492e-05, + "loss": 1.1552, + "step": 9224 + }, + { + "epoch": 2.8001214144786766, + "grad_norm": 0.6041155457496643, + "learning_rate": 5.3351220006074724e-05, + "loss": 1.0041, + "step": 9225 + }, + { + "epoch": 2.8004249506753682, + "grad_norm": 0.7977640628814697, + "learning_rate": 5.334615774020452e-05, + "loss": 1.6374, + "step": 9226 + }, + { + "epoch": 2.8007284868720594, + "grad_norm": 0.7905417680740356, + "learning_rate": 5.334109547433431e-05, + "loss": 1.4101, + "step": 9227 + }, + { + "epoch": 2.801032023068751, + "grad_norm": 0.6630216836929321, + "learning_rate": 5.333603320846411e-05, + "loss": 1.0058, + "step": 9228 + }, + { + "epoch": 2.8013355592654423, + "grad_norm": 0.8783179521560669, + "learning_rate": 5.33309709425939e-05, + "loss": 1.5429, + "step": 9229 + }, + { + "epoch": 2.801639095462134, + "grad_norm": 0.9275266528129578, + "learning_rate": 5.3325908676723704e-05, + "loss": 1.2968, + "step": 9230 + }, + { + "epoch": 2.801942631658825, + "grad_norm": 0.8065756559371948, + "learning_rate": 5.33208464108535e-05, + "loss": 1.1305, + "step": 9231 + }, + { + "epoch": 2.802246167855517, + "grad_norm": 0.7965754270553589, + "learning_rate": 5.3315784144983294e-05, + "loss": 1.4949, + "step": 9232 + }, + { + "epoch": 2.8025497040522085, + "grad_norm": 0.7255224585533142, + "learning_rate": 5.331072187911309e-05, + "loss": 1.0829, + "step": 9233 + }, + { + "epoch": 2.8028532402488997, + "grad_norm": 0.855423629283905, + "learning_rate": 5.3305659613242883e-05, + "loss": 1.379, + "step": 9234 + }, + { + "epoch": 2.803156776445591, + "grad_norm": 0.8398419618606567, + "learning_rate": 5.3300597347372685e-05, + "loss": 1.4714, + "step": 9235 + }, + { + "epoch": 2.8034603126422826, + "grad_norm": 0.7005195021629333, + "learning_rate": 5.329553508150248e-05, + "loss": 0.9738, + "step": 9236 + }, + { + "epoch": 2.8037638488389742, + "grad_norm": 0.8335183262825012, + "learning_rate": 5.3290472815632275e-05, + "loss": 0.8508, + "step": 9237 + }, + { + "epoch": 2.8040673850356654, + "grad_norm": 0.7825225591659546, + "learning_rate": 5.328541054976207e-05, + "loss": 1.3419, + "step": 9238 + }, + { + "epoch": 2.804370921232357, + "grad_norm": 0.4875824749469757, + "learning_rate": 5.328034828389187e-05, + "loss": 1.7436, + "step": 9239 + }, + { + "epoch": 2.8046744574290483, + "grad_norm": 0.7876448631286621, + "learning_rate": 5.3275286018021666e-05, + "loss": 1.1507, + "step": 9240 + }, + { + "epoch": 2.80497799362574, + "grad_norm": 0.7508234977722168, + "learning_rate": 5.327022375215146e-05, + "loss": 0.9244, + "step": 9241 + }, + { + "epoch": 2.805281529822431, + "grad_norm": 0.6753659844398499, + "learning_rate": 5.326516148628127e-05, + "loss": 1.5898, + "step": 9242 + }, + { + "epoch": 2.805585066019123, + "grad_norm": 0.5607844591140747, + "learning_rate": 5.3260099220411064e-05, + "loss": 1.2225, + "step": 9243 + }, + { + "epoch": 2.8058886022158145, + "grad_norm": 0.6955126523971558, + "learning_rate": 5.325503695454086e-05, + "loss": 1.4762, + "step": 9244 + }, + { + "epoch": 2.8061921384125057, + "grad_norm": 0.8268750905990601, + "learning_rate": 5.324997468867065e-05, + "loss": 1.2952, + "step": 9245 + }, + { + "epoch": 2.806495674609197, + "grad_norm": 0.9716630578041077, + "learning_rate": 5.3244912422800455e-05, + "loss": 1.4365, + "step": 9246 + }, + { + "epoch": 2.8067992108058886, + "grad_norm": 0.819372832775116, + "learning_rate": 5.323985015693025e-05, + "loss": 1.2058, + "step": 9247 + }, + { + "epoch": 2.8071027470025802, + "grad_norm": 0.7395895719528198, + "learning_rate": 5.3234787891060044e-05, + "loss": 1.7736, + "step": 9248 + }, + { + "epoch": 2.8074062831992714, + "grad_norm": 0.6357772350311279, + "learning_rate": 5.322972562518984e-05, + "loss": 1.381, + "step": 9249 + }, + { + "epoch": 2.807709819395963, + "grad_norm": 0.7322058081626892, + "learning_rate": 5.322466335931964e-05, + "loss": 1.192, + "step": 9250 + }, + { + "epoch": 2.8080133555926543, + "grad_norm": 0.8182588815689087, + "learning_rate": 5.3219601093449436e-05, + "loss": 1.0976, + "step": 9251 + }, + { + "epoch": 2.808316891789346, + "grad_norm": 0.6597992181777954, + "learning_rate": 5.321453882757923e-05, + "loss": 0.9726, + "step": 9252 + }, + { + "epoch": 2.808620427986037, + "grad_norm": 0.827843189239502, + "learning_rate": 5.3209476561709025e-05, + "loss": 1.2613, + "step": 9253 + }, + { + "epoch": 2.808923964182729, + "grad_norm": 0.7336700558662415, + "learning_rate": 5.320441429583882e-05, + "loss": 1.4526, + "step": 9254 + }, + { + "epoch": 2.8092275003794205, + "grad_norm": 0.9792813658714294, + "learning_rate": 5.319935202996862e-05, + "loss": 0.973, + "step": 9255 + }, + { + "epoch": 2.8095310365761117, + "grad_norm": 0.8120543956756592, + "learning_rate": 5.3194289764098416e-05, + "loss": 1.2121, + "step": 9256 + }, + { + "epoch": 2.809834572772803, + "grad_norm": 0.8667252063751221, + "learning_rate": 5.318922749822821e-05, + "loss": 0.975, + "step": 9257 + }, + { + "epoch": 2.8101381089694946, + "grad_norm": 0.6807832717895508, + "learning_rate": 5.3184165232358006e-05, + "loss": 1.6509, + "step": 9258 + }, + { + "epoch": 2.810441645166186, + "grad_norm": 0.8428423404693604, + "learning_rate": 5.31791029664878e-05, + "loss": 1.1053, + "step": 9259 + }, + { + "epoch": 2.8107451813628774, + "grad_norm": 0.6943228840827942, + "learning_rate": 5.31740407006176e-05, + "loss": 0.7154, + "step": 9260 + }, + { + "epoch": 2.811048717559569, + "grad_norm": 0.6292949318885803, + "learning_rate": 5.31689784347474e-05, + "loss": 1.6805, + "step": 9261 + }, + { + "epoch": 2.8113522537562603, + "grad_norm": 0.8613457083702087, + "learning_rate": 5.316391616887719e-05, + "loss": 1.2253, + "step": 9262 + }, + { + "epoch": 2.811655789952952, + "grad_norm": 0.8359972238540649, + "learning_rate": 5.315885390300699e-05, + "loss": 1.5825, + "step": 9263 + }, + { + "epoch": 2.811959326149643, + "grad_norm": 0.7626785635948181, + "learning_rate": 5.315379163713679e-05, + "loss": 1.5913, + "step": 9264 + }, + { + "epoch": 2.812262862346335, + "grad_norm": 0.8913024067878723, + "learning_rate": 5.314872937126658e-05, + "loss": 1.0195, + "step": 9265 + }, + { + "epoch": 2.8125663985430265, + "grad_norm": 0.7775774002075195, + "learning_rate": 5.314366710539638e-05, + "loss": 1.4487, + "step": 9266 + }, + { + "epoch": 2.8128699347397177, + "grad_norm": 0.8799665570259094, + "learning_rate": 5.313860483952617e-05, + "loss": 1.3918, + "step": 9267 + }, + { + "epoch": 2.813173470936409, + "grad_norm": 0.7530032396316528, + "learning_rate": 5.313354257365597e-05, + "loss": 1.4407, + "step": 9268 + }, + { + "epoch": 2.8134770071331006, + "grad_norm": 0.8552557826042175, + "learning_rate": 5.312848030778577e-05, + "loss": 1.1304, + "step": 9269 + }, + { + "epoch": 2.813780543329792, + "grad_norm": 0.8629938364028931, + "learning_rate": 5.3123418041915564e-05, + "loss": 1.0011, + "step": 9270 + }, + { + "epoch": 2.8140840795264834, + "grad_norm": 0.9533644914627075, + "learning_rate": 5.311835577604536e-05, + "loss": 1.3258, + "step": 9271 + }, + { + "epoch": 2.814387615723175, + "grad_norm": 0.6983219385147095, + "learning_rate": 5.3113293510175153e-05, + "loss": 0.9842, + "step": 9272 + }, + { + "epoch": 2.8146911519198663, + "grad_norm": 0.6916873455047607, + "learning_rate": 5.310823124430495e-05, + "loss": 1.4564, + "step": 9273 + }, + { + "epoch": 2.814994688116558, + "grad_norm": 0.5489389896392822, + "learning_rate": 5.310316897843475e-05, + "loss": 0.6755, + "step": 9274 + }, + { + "epoch": 2.815298224313249, + "grad_norm": 1.0498082637786865, + "learning_rate": 5.3098106712564545e-05, + "loss": 1.3059, + "step": 9275 + }, + { + "epoch": 2.815601760509941, + "grad_norm": 1.0044301748275757, + "learning_rate": 5.309304444669434e-05, + "loss": 1.3754, + "step": 9276 + }, + { + "epoch": 2.8159052967066325, + "grad_norm": 0.8365218639373779, + "learning_rate": 5.3087982180824134e-05, + "loss": 1.1857, + "step": 9277 + }, + { + "epoch": 2.8162088329033237, + "grad_norm": 0.8548251390457153, + "learning_rate": 5.3082919914953936e-05, + "loss": 1.1869, + "step": 9278 + }, + { + "epoch": 2.816512369100015, + "grad_norm": 0.9293504357337952, + "learning_rate": 5.307785764908373e-05, + "loss": 1.1865, + "step": 9279 + }, + { + "epoch": 2.8168159052967066, + "grad_norm": 0.7599900960922241, + "learning_rate": 5.3072795383213525e-05, + "loss": 1.4314, + "step": 9280 + }, + { + "epoch": 2.817119441493398, + "grad_norm": 0.706532895565033, + "learning_rate": 5.306773311734332e-05, + "loss": 1.4149, + "step": 9281 + }, + { + "epoch": 2.8174229776900894, + "grad_norm": 0.8958925604820251, + "learning_rate": 5.3062670851473115e-05, + "loss": 1.2663, + "step": 9282 + }, + { + "epoch": 2.817726513886781, + "grad_norm": 0.7330815196037292, + "learning_rate": 5.3057608585602917e-05, + "loss": 1.4551, + "step": 9283 + }, + { + "epoch": 2.8180300500834723, + "grad_norm": 0.6814063191413879, + "learning_rate": 5.305254631973271e-05, + "loss": 1.4194, + "step": 9284 + }, + { + "epoch": 2.818333586280164, + "grad_norm": 0.9556673765182495, + "learning_rate": 5.3047484053862506e-05, + "loss": 1.2468, + "step": 9285 + }, + { + "epoch": 2.818637122476855, + "grad_norm": 1.0090731382369995, + "learning_rate": 5.30424217879923e-05, + "loss": 1.3038, + "step": 9286 + }, + { + "epoch": 2.818940658673547, + "grad_norm": 0.7862967252731323, + "learning_rate": 5.303735952212211e-05, + "loss": 1.5091, + "step": 9287 + }, + { + "epoch": 2.8192441948702385, + "grad_norm": 0.9822673201560974, + "learning_rate": 5.3032297256251904e-05, + "loss": 1.1643, + "step": 9288 + }, + { + "epoch": 2.8195477310669297, + "grad_norm": 0.6395041942596436, + "learning_rate": 5.3027234990381706e-05, + "loss": 1.2098, + "step": 9289 + }, + { + "epoch": 2.8198512672636213, + "grad_norm": 0.846056342124939, + "learning_rate": 5.30221727245115e-05, + "loss": 1.0674, + "step": 9290 + }, + { + "epoch": 2.8201548034603126, + "grad_norm": 0.7783642411231995, + "learning_rate": 5.3017110458641295e-05, + "loss": 1.552, + "step": 9291 + }, + { + "epoch": 2.820458339657004, + "grad_norm": 0.7343548536300659, + "learning_rate": 5.301204819277109e-05, + "loss": 1.2193, + "step": 9292 + }, + { + "epoch": 2.8207618758536954, + "grad_norm": 0.8262844085693359, + "learning_rate": 5.3006985926900885e-05, + "loss": 1.2968, + "step": 9293 + }, + { + "epoch": 2.821065412050387, + "grad_norm": 0.9042322635650635, + "learning_rate": 5.3001923661030686e-05, + "loss": 1.1364, + "step": 9294 + }, + { + "epoch": 2.8213689482470787, + "grad_norm": 0.6058719158172607, + "learning_rate": 5.299686139516048e-05, + "loss": 0.4124, + "step": 9295 + }, + { + "epoch": 2.82167248444377, + "grad_norm": 0.8176096677780151, + "learning_rate": 5.2991799129290276e-05, + "loss": 1.142, + "step": 9296 + }, + { + "epoch": 2.821976020640461, + "grad_norm": 1.1081751585006714, + "learning_rate": 5.298673686342007e-05, + "loss": 1.0948, + "step": 9297 + }, + { + "epoch": 2.822279556837153, + "grad_norm": 0.6902355551719666, + "learning_rate": 5.2981674597549866e-05, + "loss": 1.6632, + "step": 9298 + }, + { + "epoch": 2.8225830930338445, + "grad_norm": 0.7224792242050171, + "learning_rate": 5.297661233167967e-05, + "loss": 1.0262, + "step": 9299 + }, + { + "epoch": 2.8228866292305357, + "grad_norm": 0.6862173676490784, + "learning_rate": 5.297155006580946e-05, + "loss": 0.8252, + "step": 9300 + }, + { + "epoch": 2.8231901654272273, + "grad_norm": 0.9318156242370605, + "learning_rate": 5.296648779993926e-05, + "loss": 1.1843, + "step": 9301 + }, + { + "epoch": 2.8234937016239186, + "grad_norm": 0.7357593178749084, + "learning_rate": 5.296142553406905e-05, + "loss": 1.3198, + "step": 9302 + }, + { + "epoch": 2.82379723782061, + "grad_norm": 0.9271796941757202, + "learning_rate": 5.295636326819885e-05, + "loss": 1.2251, + "step": 9303 + }, + { + "epoch": 2.8241007740173014, + "grad_norm": 0.6771527528762817, + "learning_rate": 5.295130100232865e-05, + "loss": 1.0557, + "step": 9304 + }, + { + "epoch": 2.824404310213993, + "grad_norm": 0.6915475726127625, + "learning_rate": 5.294623873645844e-05, + "loss": 1.5614, + "step": 9305 + }, + { + "epoch": 2.8247078464106847, + "grad_norm": 0.8248263001441956, + "learning_rate": 5.294117647058824e-05, + "loss": 1.8742, + "step": 9306 + }, + { + "epoch": 2.825011382607376, + "grad_norm": 0.7455428838729858, + "learning_rate": 5.293611420471803e-05, + "loss": 0.9254, + "step": 9307 + }, + { + "epoch": 2.825314918804067, + "grad_norm": 0.783444881439209, + "learning_rate": 5.2931051938847834e-05, + "loss": 1.0196, + "step": 9308 + }, + { + "epoch": 2.825618455000759, + "grad_norm": 0.6387648582458496, + "learning_rate": 5.292598967297763e-05, + "loss": 1.3538, + "step": 9309 + }, + { + "epoch": 2.8259219911974505, + "grad_norm": 0.7413560152053833, + "learning_rate": 5.292092740710742e-05, + "loss": 1.384, + "step": 9310 + }, + { + "epoch": 2.8262255273941417, + "grad_norm": 0.7992648482322693, + "learning_rate": 5.291586514123722e-05, + "loss": 1.6309, + "step": 9311 + }, + { + "epoch": 2.8265290635908333, + "grad_norm": 0.6895065307617188, + "learning_rate": 5.291080287536701e-05, + "loss": 1.3435, + "step": 9312 + }, + { + "epoch": 2.8268325997875245, + "grad_norm": 0.7920868396759033, + "learning_rate": 5.2905740609496815e-05, + "loss": 1.6338, + "step": 9313 + }, + { + "epoch": 2.827136135984216, + "grad_norm": 0.6923069953918457, + "learning_rate": 5.290067834362661e-05, + "loss": 1.5719, + "step": 9314 + }, + { + "epoch": 2.8274396721809074, + "grad_norm": 0.708824872970581, + "learning_rate": 5.2895616077756404e-05, + "loss": 1.4207, + "step": 9315 + }, + { + "epoch": 2.827743208377599, + "grad_norm": 0.6963658928871155, + "learning_rate": 5.28905538118862e-05, + "loss": 1.5641, + "step": 9316 + }, + { + "epoch": 2.8280467445742907, + "grad_norm": 0.7942947149276733, + "learning_rate": 5.2885491546016e-05, + "loss": 1.0428, + "step": 9317 + }, + { + "epoch": 2.828350280770982, + "grad_norm": 0.6340756416320801, + "learning_rate": 5.2880429280145795e-05, + "loss": 1.6218, + "step": 9318 + }, + { + "epoch": 2.828653816967673, + "grad_norm": 0.9371388554573059, + "learning_rate": 5.287536701427559e-05, + "loss": 1.3161, + "step": 9319 + }, + { + "epoch": 2.828957353164365, + "grad_norm": 0.912032425403595, + "learning_rate": 5.2870304748405385e-05, + "loss": 1.6321, + "step": 9320 + }, + { + "epoch": 2.8292608893610565, + "grad_norm": 0.6722336411476135, + "learning_rate": 5.286524248253518e-05, + "loss": 1.71, + "step": 9321 + }, + { + "epoch": 2.8295644255577477, + "grad_norm": 0.7177336812019348, + "learning_rate": 5.286018021666498e-05, + "loss": 1.2445, + "step": 9322 + }, + { + "epoch": 2.8298679617544393, + "grad_norm": 0.7678077220916748, + "learning_rate": 5.2855117950794776e-05, + "loss": 1.413, + "step": 9323 + }, + { + "epoch": 2.8301714979511305, + "grad_norm": 0.7351728677749634, + "learning_rate": 5.285005568492457e-05, + "loss": 1.2427, + "step": 9324 + }, + { + "epoch": 2.830475034147822, + "grad_norm": 0.7685090899467468, + "learning_rate": 5.2844993419054366e-05, + "loss": 1.4481, + "step": 9325 + }, + { + "epoch": 2.8307785703445134, + "grad_norm": 0.6873645186424255, + "learning_rate": 5.283993115318416e-05, + "loss": 1.485, + "step": 9326 + }, + { + "epoch": 2.831082106541205, + "grad_norm": 0.8315947651863098, + "learning_rate": 5.283486888731396e-05, + "loss": 1.1884, + "step": 9327 + }, + { + "epoch": 2.8313856427378967, + "grad_norm": 0.7474330067634583, + "learning_rate": 5.282980662144376e-05, + "loss": 1.5404, + "step": 9328 + }, + { + "epoch": 2.831689178934588, + "grad_norm": 0.7282052040100098, + "learning_rate": 5.282474435557355e-05, + "loss": 1.4322, + "step": 9329 + }, + { + "epoch": 2.831992715131279, + "grad_norm": 0.6186273694038391, + "learning_rate": 5.2819682089703346e-05, + "loss": 1.7328, + "step": 9330 + }, + { + "epoch": 2.832296251327971, + "grad_norm": 0.6695376634597778, + "learning_rate": 5.2814619823833155e-05, + "loss": 1.4427, + "step": 9331 + }, + { + "epoch": 2.8325997875246625, + "grad_norm": 0.7590188980102539, + "learning_rate": 5.280955755796295e-05, + "loss": 1.2487, + "step": 9332 + }, + { + "epoch": 2.8329033237213537, + "grad_norm": 0.7978704571723938, + "learning_rate": 5.280449529209275e-05, + "loss": 1.5149, + "step": 9333 + }, + { + "epoch": 2.8332068599180453, + "grad_norm": 0.7168574333190918, + "learning_rate": 5.2799433026222546e-05, + "loss": 1.7076, + "step": 9334 + }, + { + "epoch": 2.8335103961147365, + "grad_norm": 1.048042893409729, + "learning_rate": 5.279437076035234e-05, + "loss": 0.8254, + "step": 9335 + }, + { + "epoch": 2.833813932311428, + "grad_norm": 0.7505400776863098, + "learning_rate": 5.2789308494482135e-05, + "loss": 0.9113, + "step": 9336 + }, + { + "epoch": 2.8341174685081194, + "grad_norm": 0.7891983985900879, + "learning_rate": 5.278424622861193e-05, + "loss": 1.6025, + "step": 9337 + }, + { + "epoch": 2.834421004704811, + "grad_norm": 0.8563476204872131, + "learning_rate": 5.277918396274173e-05, + "loss": 1.3948, + "step": 9338 + }, + { + "epoch": 2.8347245409015027, + "grad_norm": 0.7944004535675049, + "learning_rate": 5.277412169687153e-05, + "loss": 1.6755, + "step": 9339 + }, + { + "epoch": 2.835028077098194, + "grad_norm": 0.8097591996192932, + "learning_rate": 5.276905943100132e-05, + "loss": 1.6187, + "step": 9340 + }, + { + "epoch": 2.835331613294885, + "grad_norm": 0.6555790305137634, + "learning_rate": 5.2763997165131116e-05, + "loss": 1.4747, + "step": 9341 + }, + { + "epoch": 2.835635149491577, + "grad_norm": 0.7448464632034302, + "learning_rate": 5.275893489926092e-05, + "loss": 1.4133, + "step": 9342 + }, + { + "epoch": 2.8359386856882685, + "grad_norm": 0.7309173941612244, + "learning_rate": 5.275387263339071e-05, + "loss": 1.4461, + "step": 9343 + }, + { + "epoch": 2.8362422218849597, + "grad_norm": 0.7526821494102478, + "learning_rate": 5.274881036752051e-05, + "loss": 1.5288, + "step": 9344 + }, + { + "epoch": 2.8365457580816513, + "grad_norm": 0.8191508650779724, + "learning_rate": 5.27437481016503e-05, + "loss": 1.4208, + "step": 9345 + }, + { + "epoch": 2.8368492942783425, + "grad_norm": 0.7021205425262451, + "learning_rate": 5.27386858357801e-05, + "loss": 1.02, + "step": 9346 + }, + { + "epoch": 2.837152830475034, + "grad_norm": 0.9767215847969055, + "learning_rate": 5.27336235699099e-05, + "loss": 1.3927, + "step": 9347 + }, + { + "epoch": 2.8374563666717254, + "grad_norm": 0.796252965927124, + "learning_rate": 5.272856130403969e-05, + "loss": 1.1261, + "step": 9348 + }, + { + "epoch": 2.837759902868417, + "grad_norm": 0.8417410850524902, + "learning_rate": 5.272349903816949e-05, + "loss": 1.3771, + "step": 9349 + }, + { + "epoch": 2.8380634390651087, + "grad_norm": 0.7039379477500916, + "learning_rate": 5.271843677229928e-05, + "loss": 1.3596, + "step": 9350 + }, + { + "epoch": 2.8383669752618, + "grad_norm": 0.9071443676948547, + "learning_rate": 5.271337450642908e-05, + "loss": 1.4462, + "step": 9351 + }, + { + "epoch": 2.8386705114584916, + "grad_norm": 0.7268051505088806, + "learning_rate": 5.270831224055888e-05, + "loss": 0.9868, + "step": 9352 + }, + { + "epoch": 2.838974047655183, + "grad_norm": 0.8409135341644287, + "learning_rate": 5.2703249974688674e-05, + "loss": 1.4759, + "step": 9353 + }, + { + "epoch": 2.8392775838518745, + "grad_norm": 0.7975213527679443, + "learning_rate": 5.269818770881847e-05, + "loss": 1.5096, + "step": 9354 + }, + { + "epoch": 2.8395811200485657, + "grad_norm": 0.9084259867668152, + "learning_rate": 5.2693125442948264e-05, + "loss": 1.4095, + "step": 9355 + }, + { + "epoch": 2.8398846562452573, + "grad_norm": 0.8844589591026306, + "learning_rate": 5.2688063177078065e-05, + "loss": 0.9897, + "step": 9356 + }, + { + "epoch": 2.840188192441949, + "grad_norm": 0.8133668899536133, + "learning_rate": 5.268300091120786e-05, + "loss": 1.4553, + "step": 9357 + }, + { + "epoch": 2.84049172863864, + "grad_norm": 0.695389449596405, + "learning_rate": 5.2677938645337655e-05, + "loss": 0.6254, + "step": 9358 + }, + { + "epoch": 2.8407952648353314, + "grad_norm": 0.6495389938354492, + "learning_rate": 5.267287637946745e-05, + "loss": 1.2818, + "step": 9359 + }, + { + "epoch": 2.841098801032023, + "grad_norm": 0.9445971250534058, + "learning_rate": 5.2667814113597244e-05, + "loss": 1.328, + "step": 9360 + }, + { + "epoch": 2.8414023372287147, + "grad_norm": 0.8059170246124268, + "learning_rate": 5.2662751847727046e-05, + "loss": 1.4942, + "step": 9361 + }, + { + "epoch": 2.841705873425406, + "grad_norm": 0.9014286994934082, + "learning_rate": 5.265768958185684e-05, + "loss": 1.045, + "step": 9362 + }, + { + "epoch": 2.8420094096220976, + "grad_norm": 0.7845970988273621, + "learning_rate": 5.2652627315986636e-05, + "loss": 1.3251, + "step": 9363 + }, + { + "epoch": 2.842312945818789, + "grad_norm": 0.8027483820915222, + "learning_rate": 5.264756505011643e-05, + "loss": 1.6816, + "step": 9364 + }, + { + "epoch": 2.8426164820154805, + "grad_norm": 0.7307506203651428, + "learning_rate": 5.2642502784246225e-05, + "loss": 1.4705, + "step": 9365 + }, + { + "epoch": 2.8429200182121717, + "grad_norm": 0.6807789206504822, + "learning_rate": 5.263744051837603e-05, + "loss": 0.9371, + "step": 9366 + }, + { + "epoch": 2.8432235544088633, + "grad_norm": 0.7624093890190125, + "learning_rate": 5.263237825250582e-05, + "loss": 1.4282, + "step": 9367 + }, + { + "epoch": 2.843527090605555, + "grad_norm": 0.7262867093086243, + "learning_rate": 5.2627315986635616e-05, + "loss": 0.9036, + "step": 9368 + }, + { + "epoch": 2.843830626802246, + "grad_norm": 0.8020523190498352, + "learning_rate": 5.262225372076541e-05, + "loss": 1.6797, + "step": 9369 + }, + { + "epoch": 2.8441341629989374, + "grad_norm": 0.790020227432251, + "learning_rate": 5.261719145489521e-05, + "loss": 1.4689, + "step": 9370 + }, + { + "epoch": 2.844437699195629, + "grad_norm": 0.5507739186286926, + "learning_rate": 5.261212918902501e-05, + "loss": 0.899, + "step": 9371 + }, + { + "epoch": 2.8447412353923207, + "grad_norm": 0.7228598594665527, + "learning_rate": 5.26070669231548e-05, + "loss": 1.2649, + "step": 9372 + }, + { + "epoch": 2.845044771589012, + "grad_norm": 0.7899906039237976, + "learning_rate": 5.26020046572846e-05, + "loss": 1.4408, + "step": 9373 + }, + { + "epoch": 2.8453483077857036, + "grad_norm": 0.8347296714782715, + "learning_rate": 5.259694239141439e-05, + "loss": 1.1176, + "step": 9374 + }, + { + "epoch": 2.845651843982395, + "grad_norm": 0.57683265209198, + "learning_rate": 5.2591880125544193e-05, + "loss": 1.049, + "step": 9375 + }, + { + "epoch": 2.8459553801790864, + "grad_norm": 0.7391901016235352, + "learning_rate": 5.2586817859673995e-05, + "loss": 1.6252, + "step": 9376 + }, + { + "epoch": 2.8462589163757777, + "grad_norm": 1.206459879875183, + "learning_rate": 5.2581755593803797e-05, + "loss": 1.1132, + "step": 9377 + }, + { + "epoch": 2.8465624525724693, + "grad_norm": 0.6464114189147949, + "learning_rate": 5.257669332793359e-05, + "loss": 1.23, + "step": 9378 + }, + { + "epoch": 2.846865988769161, + "grad_norm": 0.7888322472572327, + "learning_rate": 5.2571631062063386e-05, + "loss": 1.02, + "step": 9379 + }, + { + "epoch": 2.847169524965852, + "grad_norm": 0.8795018792152405, + "learning_rate": 5.256656879619318e-05, + "loss": 1.4062, + "step": 9380 + }, + { + "epoch": 2.8474730611625434, + "grad_norm": 0.7494449019432068, + "learning_rate": 5.256150653032298e-05, + "loss": 1.3383, + "step": 9381 + }, + { + "epoch": 2.847776597359235, + "grad_norm": 0.7852158546447754, + "learning_rate": 5.255644426445278e-05, + "loss": 1.3933, + "step": 9382 + }, + { + "epoch": 2.8480801335559267, + "grad_norm": 0.7150129079818726, + "learning_rate": 5.255138199858257e-05, + "loss": 1.4461, + "step": 9383 + }, + { + "epoch": 2.848383669752618, + "grad_norm": 0.664424479007721, + "learning_rate": 5.254631973271237e-05, + "loss": 0.7387, + "step": 9384 + }, + { + "epoch": 2.8486872059493096, + "grad_norm": 0.8127879500389099, + "learning_rate": 5.254125746684216e-05, + "loss": 0.7265, + "step": 9385 + }, + { + "epoch": 2.848990742146001, + "grad_norm": 0.7500486969947815, + "learning_rate": 5.253619520097196e-05, + "loss": 1.5836, + "step": 9386 + }, + { + "epoch": 2.8492942783426924, + "grad_norm": 0.8652605414390564, + "learning_rate": 5.253113293510176e-05, + "loss": 1.1918, + "step": 9387 + }, + { + "epoch": 2.8495978145393837, + "grad_norm": 0.7804118394851685, + "learning_rate": 5.252607066923155e-05, + "loss": 1.2734, + "step": 9388 + }, + { + "epoch": 2.8499013507360753, + "grad_norm": 0.8731774687767029, + "learning_rate": 5.252100840336135e-05, + "loss": 1.0997, + "step": 9389 + }, + { + "epoch": 2.850204886932767, + "grad_norm": 0.8509650230407715, + "learning_rate": 5.251594613749114e-05, + "loss": 1.265, + "step": 9390 + }, + { + "epoch": 2.850508423129458, + "grad_norm": 0.7483029961585999, + "learning_rate": 5.2510883871620944e-05, + "loss": 1.5143, + "step": 9391 + }, + { + "epoch": 2.8508119593261494, + "grad_norm": 0.7632007598876953, + "learning_rate": 5.250582160575074e-05, + "loss": 1.4062, + "step": 9392 + }, + { + "epoch": 2.851115495522841, + "grad_norm": 0.754703938961029, + "learning_rate": 5.2500759339880534e-05, + "loss": 1.6202, + "step": 9393 + }, + { + "epoch": 2.8514190317195327, + "grad_norm": 0.8680121302604675, + "learning_rate": 5.249569707401033e-05, + "loss": 1.1555, + "step": 9394 + }, + { + "epoch": 2.851722567916224, + "grad_norm": 0.8086943626403809, + "learning_rate": 5.249063480814013e-05, + "loss": 1.0458, + "step": 9395 + }, + { + "epoch": 2.8520261041129156, + "grad_norm": 0.6513628363609314, + "learning_rate": 5.2485572542269925e-05, + "loss": 1.037, + "step": 9396 + }, + { + "epoch": 2.852329640309607, + "grad_norm": 0.7701380848884583, + "learning_rate": 5.248051027639972e-05, + "loss": 1.1105, + "step": 9397 + }, + { + "epoch": 2.8526331765062984, + "grad_norm": 0.9460353851318359, + "learning_rate": 5.2475448010529514e-05, + "loss": 1.3801, + "step": 9398 + }, + { + "epoch": 2.8529367127029897, + "grad_norm": 0.7532127499580383, + "learning_rate": 5.247038574465931e-05, + "loss": 0.9574, + "step": 9399 + }, + { + "epoch": 2.8532402488996813, + "grad_norm": 0.8332433700561523, + "learning_rate": 5.246532347878911e-05, + "loss": 1.3094, + "step": 9400 + }, + { + "epoch": 2.853543785096373, + "grad_norm": 0.8725066184997559, + "learning_rate": 5.2460261212918906e-05, + "loss": 0.8585, + "step": 9401 + }, + { + "epoch": 2.853847321293064, + "grad_norm": 0.7810158133506775, + "learning_rate": 5.24551989470487e-05, + "loss": 1.4824, + "step": 9402 + }, + { + "epoch": 2.8541508574897554, + "grad_norm": 0.7571697235107422, + "learning_rate": 5.2450136681178495e-05, + "loss": 1.5745, + "step": 9403 + }, + { + "epoch": 2.854454393686447, + "grad_norm": 0.7732252478599548, + "learning_rate": 5.244507441530829e-05, + "loss": 1.6005, + "step": 9404 + }, + { + "epoch": 2.8547579298831387, + "grad_norm": 0.7137447595596313, + "learning_rate": 5.244001214943809e-05, + "loss": 1.4944, + "step": 9405 + }, + { + "epoch": 2.85506146607983, + "grad_norm": 0.9943833947181702, + "learning_rate": 5.2434949883567886e-05, + "loss": 1.02, + "step": 9406 + }, + { + "epoch": 2.8553650022765216, + "grad_norm": 0.5583171844482422, + "learning_rate": 5.242988761769768e-05, + "loss": 1.515, + "step": 9407 + }, + { + "epoch": 2.855668538473213, + "grad_norm": 0.7085838317871094, + "learning_rate": 5.2424825351827476e-05, + "loss": 0.9852, + "step": 9408 + }, + { + "epoch": 2.8559720746699044, + "grad_norm": 0.7515712976455688, + "learning_rate": 5.241976308595728e-05, + "loss": 1.5338, + "step": 9409 + }, + { + "epoch": 2.8562756108665956, + "grad_norm": 0.7197819948196411, + "learning_rate": 5.241470082008707e-05, + "loss": 1.6469, + "step": 9410 + }, + { + "epoch": 2.8565791470632873, + "grad_norm": 0.6142263412475586, + "learning_rate": 5.240963855421687e-05, + "loss": 1.9789, + "step": 9411 + }, + { + "epoch": 2.856882683259979, + "grad_norm": 0.9562734961509705, + "learning_rate": 5.240457628834666e-05, + "loss": 1.4231, + "step": 9412 + }, + { + "epoch": 2.85718621945667, + "grad_norm": 0.8005004525184631, + "learning_rate": 5.239951402247646e-05, + "loss": 1.4874, + "step": 9413 + }, + { + "epoch": 2.857489755653362, + "grad_norm": 0.8507754802703857, + "learning_rate": 5.239445175660626e-05, + "loss": 1.4512, + "step": 9414 + }, + { + "epoch": 2.857793291850053, + "grad_norm": 0.7643118500709534, + "learning_rate": 5.238938949073605e-05, + "loss": 1.1473, + "step": 9415 + }, + { + "epoch": 2.8580968280467447, + "grad_norm": 0.7403945922851562, + "learning_rate": 5.238432722486585e-05, + "loss": 1.5298, + "step": 9416 + }, + { + "epoch": 2.858400364243436, + "grad_norm": 0.8773601651191711, + "learning_rate": 5.237926495899564e-05, + "loss": 1.3813, + "step": 9417 + }, + { + "epoch": 2.8587039004401276, + "grad_norm": 0.5954939723014832, + "learning_rate": 5.237420269312544e-05, + "loss": 1.565, + "step": 9418 + }, + { + "epoch": 2.8590074366368188, + "grad_norm": 0.7224332690238953, + "learning_rate": 5.236914042725524e-05, + "loss": 0.9511, + "step": 9419 + }, + { + "epoch": 2.8593109728335104, + "grad_norm": 0.7605741024017334, + "learning_rate": 5.236407816138505e-05, + "loss": 1.3148, + "step": 9420 + }, + { + "epoch": 2.8596145090302016, + "grad_norm": 0.6351408362388611, + "learning_rate": 5.235901589551484e-05, + "loss": 1.1754, + "step": 9421 + }, + { + "epoch": 2.8599180452268933, + "grad_norm": 0.7478115558624268, + "learning_rate": 5.235395362964464e-05, + "loss": 1.5339, + "step": 9422 + }, + { + "epoch": 2.860221581423585, + "grad_norm": 0.726111888885498, + "learning_rate": 5.234889136377443e-05, + "loss": 1.124, + "step": 9423 + }, + { + "epoch": 2.860525117620276, + "grad_norm": 0.9777094125747681, + "learning_rate": 5.2343829097904226e-05, + "loss": 1.347, + "step": 9424 + }, + { + "epoch": 2.860828653816968, + "grad_norm": 0.7683424949645996, + "learning_rate": 5.233876683203403e-05, + "loss": 1.1178, + "step": 9425 + }, + { + "epoch": 2.861132190013659, + "grad_norm": 0.7628284096717834, + "learning_rate": 5.233370456616382e-05, + "loss": 1.433, + "step": 9426 + }, + { + "epoch": 2.8614357262103507, + "grad_norm": 0.7160748243331909, + "learning_rate": 5.232864230029362e-05, + "loss": 1.673, + "step": 9427 + }, + { + "epoch": 2.861739262407042, + "grad_norm": 0.6520998477935791, + "learning_rate": 5.232358003442341e-05, + "loss": 1.3751, + "step": 9428 + }, + { + "epoch": 2.8620427986037336, + "grad_norm": 0.7842011451721191, + "learning_rate": 5.231851776855321e-05, + "loss": 1.2587, + "step": 9429 + }, + { + "epoch": 2.862346334800425, + "grad_norm": 1.2248660326004028, + "learning_rate": 5.231345550268301e-05, + "loss": 0.9244, + "step": 9430 + }, + { + "epoch": 2.8626498709971164, + "grad_norm": 0.8425837159156799, + "learning_rate": 5.2308393236812804e-05, + "loss": 1.1696, + "step": 9431 + }, + { + "epoch": 2.8629534071938076, + "grad_norm": 0.7808385491371155, + "learning_rate": 5.23033309709426e-05, + "loss": 1.3076, + "step": 9432 + }, + { + "epoch": 2.8632569433904993, + "grad_norm": 0.8945707082748413, + "learning_rate": 5.229826870507239e-05, + "loss": 1.5213, + "step": 9433 + }, + { + "epoch": 2.863560479587191, + "grad_norm": 0.6636887192726135, + "learning_rate": 5.2293206439202195e-05, + "loss": 1.2744, + "step": 9434 + }, + { + "epoch": 2.863864015783882, + "grad_norm": 0.7467337846755981, + "learning_rate": 5.228814417333199e-05, + "loss": 1.3888, + "step": 9435 + }, + { + "epoch": 2.864167551980574, + "grad_norm": 0.685385525226593, + "learning_rate": 5.2283081907461784e-05, + "loss": 1.1128, + "step": 9436 + }, + { + "epoch": 2.864471088177265, + "grad_norm": 0.8079697489738464, + "learning_rate": 5.227801964159158e-05, + "loss": 1.4654, + "step": 9437 + }, + { + "epoch": 2.8647746243739567, + "grad_norm": 0.6123029589653015, + "learning_rate": 5.2272957375721374e-05, + "loss": 1.2247, + "step": 9438 + }, + { + "epoch": 2.865078160570648, + "grad_norm": 0.6182090640068054, + "learning_rate": 5.2267895109851175e-05, + "loss": 1.3447, + "step": 9439 + }, + { + "epoch": 2.8653816967673396, + "grad_norm": 0.7628608345985413, + "learning_rate": 5.226283284398097e-05, + "loss": 1.5927, + "step": 9440 + }, + { + "epoch": 2.865685232964031, + "grad_norm": 0.7146131992340088, + "learning_rate": 5.2257770578110765e-05, + "loss": 1.2454, + "step": 9441 + }, + { + "epoch": 2.8659887691607224, + "grad_norm": 0.9080950021743774, + "learning_rate": 5.225270831224056e-05, + "loss": 1.4425, + "step": 9442 + }, + { + "epoch": 2.8662923053574136, + "grad_norm": 0.8173847794532776, + "learning_rate": 5.2247646046370355e-05, + "loss": 1.0399, + "step": 9443 + }, + { + "epoch": 2.8665958415541053, + "grad_norm": 0.9067602157592773, + "learning_rate": 5.2242583780500156e-05, + "loss": 1.4186, + "step": 9444 + }, + { + "epoch": 2.866899377750797, + "grad_norm": 0.6521152257919312, + "learning_rate": 5.223752151462995e-05, + "loss": 1.3131, + "step": 9445 + }, + { + "epoch": 2.867202913947488, + "grad_norm": 0.8556538820266724, + "learning_rate": 5.2232459248759746e-05, + "loss": 1.3317, + "step": 9446 + }, + { + "epoch": 2.86750645014418, + "grad_norm": 0.7497937679290771, + "learning_rate": 5.222739698288954e-05, + "loss": 1.3419, + "step": 9447 + }, + { + "epoch": 2.867809986340871, + "grad_norm": 0.6333903074264526, + "learning_rate": 5.222233471701934e-05, + "loss": 1.5285, + "step": 9448 + }, + { + "epoch": 2.8681135225375627, + "grad_norm": 0.7968592643737793, + "learning_rate": 5.221727245114914e-05, + "loss": 1.4826, + "step": 9449 + }, + { + "epoch": 2.868417058734254, + "grad_norm": 0.7306660413742065, + "learning_rate": 5.221221018527893e-05, + "loss": 1.6084, + "step": 9450 + }, + { + "epoch": 2.8687205949309456, + "grad_norm": 0.7376747131347656, + "learning_rate": 5.2207147919408727e-05, + "loss": 1.6202, + "step": 9451 + }, + { + "epoch": 2.869024131127637, + "grad_norm": 0.5895881652832031, + "learning_rate": 5.220208565353852e-05, + "loss": 1.4214, + "step": 9452 + }, + { + "epoch": 2.8693276673243284, + "grad_norm": 0.7723854184150696, + "learning_rate": 5.219702338766832e-05, + "loss": 0.9924, + "step": 9453 + }, + { + "epoch": 2.8696312035210196, + "grad_norm": 0.6269218325614929, + "learning_rate": 5.219196112179812e-05, + "loss": 1.9052, + "step": 9454 + }, + { + "epoch": 2.8699347397177113, + "grad_norm": 0.7254565954208374, + "learning_rate": 5.218689885592791e-05, + "loss": 0.9028, + "step": 9455 + }, + { + "epoch": 2.870238275914403, + "grad_norm": 0.95447838306427, + "learning_rate": 5.218183659005771e-05, + "loss": 1.1353, + "step": 9456 + }, + { + "epoch": 2.870541812111094, + "grad_norm": 0.8039798736572266, + "learning_rate": 5.21767743241875e-05, + "loss": 1.1365, + "step": 9457 + }, + { + "epoch": 2.870845348307786, + "grad_norm": 0.7894598841667175, + "learning_rate": 5.2171712058317304e-05, + "loss": 1.268, + "step": 9458 + }, + { + "epoch": 2.871148884504477, + "grad_norm": 0.5049625039100647, + "learning_rate": 5.21666497924471e-05, + "loss": 0.98, + "step": 9459 + }, + { + "epoch": 2.8714524207011687, + "grad_norm": 0.6731208562850952, + "learning_rate": 5.216158752657689e-05, + "loss": 1.229, + "step": 9460 + }, + { + "epoch": 2.87175595689786, + "grad_norm": 0.8631210923194885, + "learning_rate": 5.215652526070669e-05, + "loss": 1.5062, + "step": 9461 + }, + { + "epoch": 2.8720594930945516, + "grad_norm": 0.7528581023216248, + "learning_rate": 5.215146299483649e-05, + "loss": 0.4431, + "step": 9462 + }, + { + "epoch": 2.872363029291243, + "grad_norm": 0.7947737574577332, + "learning_rate": 5.2146400728966284e-05, + "loss": 1.3533, + "step": 9463 + }, + { + "epoch": 2.8726665654879344, + "grad_norm": 0.8122506141662598, + "learning_rate": 5.214133846309608e-05, + "loss": 1.0848, + "step": 9464 + }, + { + "epoch": 2.8729701016846256, + "grad_norm": 0.8927140235900879, + "learning_rate": 5.213627619722589e-05, + "loss": 1.1561, + "step": 9465 + }, + { + "epoch": 2.8732736378813173, + "grad_norm": 0.8999928832054138, + "learning_rate": 5.213121393135568e-05, + "loss": 1.5133, + "step": 9466 + }, + { + "epoch": 2.873577174078009, + "grad_norm": 0.7747072577476501, + "learning_rate": 5.212615166548548e-05, + "loss": 1.4042, + "step": 9467 + }, + { + "epoch": 2.8738807102747, + "grad_norm": 0.7205396890640259, + "learning_rate": 5.212108939961527e-05, + "loss": 1.6049, + "step": 9468 + }, + { + "epoch": 2.874184246471392, + "grad_norm": 0.7850732803344727, + "learning_rate": 5.2116027133745074e-05, + "loss": 1.6288, + "step": 9469 + }, + { + "epoch": 2.874487782668083, + "grad_norm": 0.7782788276672363, + "learning_rate": 5.211096486787487e-05, + "loss": 1.443, + "step": 9470 + }, + { + "epoch": 2.8747913188647747, + "grad_norm": 0.8070952296257019, + "learning_rate": 5.210590260200466e-05, + "loss": 1.4859, + "step": 9471 + }, + { + "epoch": 2.875094855061466, + "grad_norm": 0.7127992510795593, + "learning_rate": 5.210084033613446e-05, + "loss": 1.7129, + "step": 9472 + }, + { + "epoch": 2.8753983912581575, + "grad_norm": 0.6095362305641174, + "learning_rate": 5.209577807026426e-05, + "loss": 1.4317, + "step": 9473 + }, + { + "epoch": 2.875701927454849, + "grad_norm": 0.9193838834762573, + "learning_rate": 5.2090715804394054e-05, + "loss": 1.4304, + "step": 9474 + }, + { + "epoch": 2.8760054636515404, + "grad_norm": 0.8403457403182983, + "learning_rate": 5.208565353852385e-05, + "loss": 1.372, + "step": 9475 + }, + { + "epoch": 2.8763089998482316, + "grad_norm": 0.8098099827766418, + "learning_rate": 5.2080591272653644e-05, + "loss": 0.6, + "step": 9476 + }, + { + "epoch": 2.8766125360449233, + "grad_norm": 0.9631059765815735, + "learning_rate": 5.207552900678344e-05, + "loss": 1.0892, + "step": 9477 + }, + { + "epoch": 2.876916072241615, + "grad_norm": 0.8415666222572327, + "learning_rate": 5.207046674091324e-05, + "loss": 1.2686, + "step": 9478 + }, + { + "epoch": 2.877219608438306, + "grad_norm": 0.6405428647994995, + "learning_rate": 5.2065404475043035e-05, + "loss": 1.1467, + "step": 9479 + }, + { + "epoch": 2.877523144634998, + "grad_norm": 0.7595462203025818, + "learning_rate": 5.206034220917283e-05, + "loss": 1.4007, + "step": 9480 + }, + { + "epoch": 2.877826680831689, + "grad_norm": 0.8996461033821106, + "learning_rate": 5.2055279943302625e-05, + "loss": 1.4708, + "step": 9481 + }, + { + "epoch": 2.8781302170283807, + "grad_norm": 0.7555029392242432, + "learning_rate": 5.205021767743242e-05, + "loss": 1.1784, + "step": 9482 + }, + { + "epoch": 2.878433753225072, + "grad_norm": 0.7749983668327332, + "learning_rate": 5.204515541156222e-05, + "loss": 1.1931, + "step": 9483 + }, + { + "epoch": 2.8787372894217635, + "grad_norm": 0.7814266681671143, + "learning_rate": 5.2040093145692016e-05, + "loss": 1.2516, + "step": 9484 + }, + { + "epoch": 2.879040825618455, + "grad_norm": 0.9828033447265625, + "learning_rate": 5.203503087982181e-05, + "loss": 1.3432, + "step": 9485 + }, + { + "epoch": 2.8793443618151464, + "grad_norm": 0.7643359303474426, + "learning_rate": 5.2029968613951605e-05, + "loss": 1.0592, + "step": 9486 + }, + { + "epoch": 2.879647898011838, + "grad_norm": 0.6870555877685547, + "learning_rate": 5.202490634808141e-05, + "loss": 1.6373, + "step": 9487 + }, + { + "epoch": 2.8799514342085293, + "grad_norm": 0.5364072918891907, + "learning_rate": 5.20198440822112e-05, + "loss": 1.8056, + "step": 9488 + }, + { + "epoch": 2.880254970405221, + "grad_norm": 0.6645576357841492, + "learning_rate": 5.2014781816340997e-05, + "loss": 0.8111, + "step": 9489 + }, + { + "epoch": 2.880558506601912, + "grad_norm": 0.7039901614189148, + "learning_rate": 5.200971955047079e-05, + "loss": 1.2746, + "step": 9490 + }, + { + "epoch": 2.880862042798604, + "grad_norm": 0.816798985004425, + "learning_rate": 5.2004657284600586e-05, + "loss": 1.5521, + "step": 9491 + }, + { + "epoch": 2.8811655789952955, + "grad_norm": 0.8264162540435791, + "learning_rate": 5.199959501873039e-05, + "loss": 0.9209, + "step": 9492 + }, + { + "epoch": 2.8814691151919867, + "grad_norm": 0.7069061994552612, + "learning_rate": 5.199453275286018e-05, + "loss": 1.0456, + "step": 9493 + }, + { + "epoch": 2.881772651388678, + "grad_norm": 0.7477339506149292, + "learning_rate": 5.198947048698998e-05, + "loss": 1.0461, + "step": 9494 + }, + { + "epoch": 2.8820761875853695, + "grad_norm": 0.783385157585144, + "learning_rate": 5.198440822111977e-05, + "loss": 1.3856, + "step": 9495 + }, + { + "epoch": 2.882379723782061, + "grad_norm": 0.6643350720405579, + "learning_rate": 5.197934595524957e-05, + "loss": 1.5486, + "step": 9496 + }, + { + "epoch": 2.8826832599787524, + "grad_norm": 0.6355939507484436, + "learning_rate": 5.197428368937937e-05, + "loss": 1.0737, + "step": 9497 + }, + { + "epoch": 2.882986796175444, + "grad_norm": 0.5952500700950623, + "learning_rate": 5.196922142350916e-05, + "loss": 1.7085, + "step": 9498 + }, + { + "epoch": 2.8832903323721353, + "grad_norm": 0.7849879860877991, + "learning_rate": 5.196415915763896e-05, + "loss": 1.4298, + "step": 9499 + }, + { + "epoch": 2.883593868568827, + "grad_norm": 0.5713488459587097, + "learning_rate": 5.195909689176875e-05, + "loss": 1.3922, + "step": 9500 + }, + { + "epoch": 2.883897404765518, + "grad_norm": 0.8283959627151489, + "learning_rate": 5.1954034625898554e-05, + "loss": 1.3905, + "step": 9501 + }, + { + "epoch": 2.88420094096221, + "grad_norm": 0.8133925795555115, + "learning_rate": 5.194897236002835e-05, + "loss": 1.4816, + "step": 9502 + }, + { + "epoch": 2.8845044771589015, + "grad_norm": 0.7605336904525757, + "learning_rate": 5.1943910094158144e-05, + "loss": 1.3117, + "step": 9503 + }, + { + "epoch": 2.8848080133555927, + "grad_norm": 0.9271584153175354, + "learning_rate": 5.193884782828794e-05, + "loss": 0.9625, + "step": 9504 + }, + { + "epoch": 2.885111549552284, + "grad_norm": 0.9125571250915527, + "learning_rate": 5.1933785562417734e-05, + "loss": 1.2425, + "step": 9505 + }, + { + "epoch": 2.8854150857489755, + "grad_norm": 0.8910593390464783, + "learning_rate": 5.1928723296547535e-05, + "loss": 1.3785, + "step": 9506 + }, + { + "epoch": 2.885718621945667, + "grad_norm": 1.0134199857711792, + "learning_rate": 5.192366103067733e-05, + "loss": 1.1262, + "step": 9507 + }, + { + "epoch": 2.8860221581423584, + "grad_norm": 0.8326638340950012, + "learning_rate": 5.1918598764807125e-05, + "loss": 1.6915, + "step": 9508 + }, + { + "epoch": 2.88632569433905, + "grad_norm": 0.965233564376831, + "learning_rate": 5.191353649893693e-05, + "loss": 1.2376, + "step": 9509 + }, + { + "epoch": 2.8866292305357413, + "grad_norm": 0.7267403602600098, + "learning_rate": 5.190847423306673e-05, + "loss": 1.6445, + "step": 9510 + }, + { + "epoch": 2.886932766732433, + "grad_norm": 0.7772924304008484, + "learning_rate": 5.190341196719652e-05, + "loss": 1.1801, + "step": 9511 + }, + { + "epoch": 2.887236302929124, + "grad_norm": 0.8382841348648071, + "learning_rate": 5.1898349701326324e-05, + "loss": 1.3439, + "step": 9512 + }, + { + "epoch": 2.887539839125816, + "grad_norm": 0.7924531698226929, + "learning_rate": 5.189328743545612e-05, + "loss": 0.9468, + "step": 9513 + }, + { + "epoch": 2.8878433753225075, + "grad_norm": 0.8812182545661926, + "learning_rate": 5.1888225169585914e-05, + "loss": 1.2985, + "step": 9514 + }, + { + "epoch": 2.8881469115191987, + "grad_norm": 0.7798461318016052, + "learning_rate": 5.188316290371571e-05, + "loss": 1.3977, + "step": 9515 + }, + { + "epoch": 2.88845044771589, + "grad_norm": 1.0166219472885132, + "learning_rate": 5.1878100637845503e-05, + "loss": 1.3889, + "step": 9516 + }, + { + "epoch": 2.8887539839125815, + "grad_norm": 0.8224981427192688, + "learning_rate": 5.1873038371975305e-05, + "loss": 1.2335, + "step": 9517 + }, + { + "epoch": 2.889057520109273, + "grad_norm": 0.8364986777305603, + "learning_rate": 5.18679761061051e-05, + "loss": 1.3545, + "step": 9518 + }, + { + "epoch": 2.8893610563059644, + "grad_norm": 0.8336874842643738, + "learning_rate": 5.1862913840234895e-05, + "loss": 1.4871, + "step": 9519 + }, + { + "epoch": 2.889664592502656, + "grad_norm": 0.778023362159729, + "learning_rate": 5.185785157436469e-05, + "loss": 1.4589, + "step": 9520 + }, + { + "epoch": 2.8899681286993473, + "grad_norm": 0.7665227055549622, + "learning_rate": 5.1852789308494484e-05, + "loss": 1.4066, + "step": 9521 + }, + { + "epoch": 2.890271664896039, + "grad_norm": 0.841337263584137, + "learning_rate": 5.1847727042624286e-05, + "loss": 1.3517, + "step": 9522 + }, + { + "epoch": 2.89057520109273, + "grad_norm": 0.8286343216896057, + "learning_rate": 5.184266477675408e-05, + "loss": 1.4334, + "step": 9523 + }, + { + "epoch": 2.890878737289422, + "grad_norm": 0.8397142291069031, + "learning_rate": 5.1837602510883875e-05, + "loss": 1.4458, + "step": 9524 + }, + { + "epoch": 2.8911822734861135, + "grad_norm": 1.0814837217330933, + "learning_rate": 5.183254024501367e-05, + "loss": 0.9643, + "step": 9525 + }, + { + "epoch": 2.8914858096828047, + "grad_norm": 0.7771674990653992, + "learning_rate": 5.182747797914347e-05, + "loss": 0.8834, + "step": 9526 + }, + { + "epoch": 2.891789345879496, + "grad_norm": 0.9093636274337769, + "learning_rate": 5.1822415713273266e-05, + "loss": 1.3708, + "step": 9527 + }, + { + "epoch": 2.8920928820761875, + "grad_norm": 0.8670371174812317, + "learning_rate": 5.181735344740306e-05, + "loss": 1.5083, + "step": 9528 + }, + { + "epoch": 2.892396418272879, + "grad_norm": 0.7888594269752502, + "learning_rate": 5.1812291181532856e-05, + "loss": 1.5658, + "step": 9529 + }, + { + "epoch": 2.8926999544695704, + "grad_norm": 0.668411910533905, + "learning_rate": 5.180722891566265e-05, + "loss": 1.7654, + "step": 9530 + }, + { + "epoch": 2.893003490666262, + "grad_norm": 0.86203932762146, + "learning_rate": 5.180216664979245e-05, + "loss": 1.1276, + "step": 9531 + }, + { + "epoch": 2.8933070268629533, + "grad_norm": 0.7828831076622009, + "learning_rate": 5.179710438392225e-05, + "loss": 1.5016, + "step": 9532 + }, + { + "epoch": 2.893610563059645, + "grad_norm": 0.8838964104652405, + "learning_rate": 5.179204211805204e-05, + "loss": 1.4392, + "step": 9533 + }, + { + "epoch": 2.893914099256336, + "grad_norm": 0.8105677366256714, + "learning_rate": 5.178697985218184e-05, + "loss": 0.9082, + "step": 9534 + }, + { + "epoch": 2.894217635453028, + "grad_norm": 0.748955488204956, + "learning_rate": 5.178191758631163e-05, + "loss": 1.5931, + "step": 9535 + }, + { + "epoch": 2.8945211716497194, + "grad_norm": 0.7240814566612244, + "learning_rate": 5.177685532044143e-05, + "loss": 1.2664, + "step": 9536 + }, + { + "epoch": 2.8948247078464107, + "grad_norm": 0.6561826467514038, + "learning_rate": 5.177179305457123e-05, + "loss": 1.114, + "step": 9537 + }, + { + "epoch": 2.895128244043102, + "grad_norm": 0.7741621732711792, + "learning_rate": 5.176673078870102e-05, + "loss": 1.4439, + "step": 9538 + }, + { + "epoch": 2.8954317802397935, + "grad_norm": 0.6334800720214844, + "learning_rate": 5.176166852283082e-05, + "loss": 1.8905, + "step": 9539 + }, + { + "epoch": 2.895735316436485, + "grad_norm": 0.7378382086753845, + "learning_rate": 5.175660625696062e-05, + "loss": 1.2193, + "step": 9540 + }, + { + "epoch": 2.8960388526331764, + "grad_norm": 0.7913770079612732, + "learning_rate": 5.1751543991090414e-05, + "loss": 0.9071, + "step": 9541 + }, + { + "epoch": 2.896342388829868, + "grad_norm": 0.7039569020271301, + "learning_rate": 5.174648172522021e-05, + "loss": 1.1217, + "step": 9542 + }, + { + "epoch": 2.8966459250265593, + "grad_norm": 0.9311305284500122, + "learning_rate": 5.1741419459350004e-05, + "loss": 1.5294, + "step": 9543 + }, + { + "epoch": 2.896949461223251, + "grad_norm": 0.7685890197753906, + "learning_rate": 5.17363571934798e-05, + "loss": 1.3128, + "step": 9544 + }, + { + "epoch": 2.897252997419942, + "grad_norm": 0.7525815367698669, + "learning_rate": 5.17312949276096e-05, + "loss": 0.8357, + "step": 9545 + }, + { + "epoch": 2.897556533616634, + "grad_norm": 0.9669501781463623, + "learning_rate": 5.1726232661739395e-05, + "loss": 1.1537, + "step": 9546 + }, + { + "epoch": 2.8978600698133254, + "grad_norm": 0.8898695111274719, + "learning_rate": 5.172117039586919e-05, + "loss": 1.4388, + "step": 9547 + }, + { + "epoch": 2.8981636060100167, + "grad_norm": 0.8049153685569763, + "learning_rate": 5.1716108129998984e-05, + "loss": 1.4729, + "step": 9548 + }, + { + "epoch": 2.8984671422067083, + "grad_norm": 0.8677496314048767, + "learning_rate": 5.171104586412878e-05, + "loss": 1.027, + "step": 9549 + }, + { + "epoch": 2.8987706784033995, + "grad_norm": 0.8706070184707642, + "learning_rate": 5.170598359825858e-05, + "loss": 1.5992, + "step": 9550 + }, + { + "epoch": 2.899074214600091, + "grad_norm": 1.0179812908172607, + "learning_rate": 5.1700921332388375e-05, + "loss": 1.4936, + "step": 9551 + }, + { + "epoch": 2.8993777507967824, + "grad_norm": 0.735970139503479, + "learning_rate": 5.169585906651817e-05, + "loss": 1.7096, + "step": 9552 + }, + { + "epoch": 2.899681286993474, + "grad_norm": 0.5985288023948669, + "learning_rate": 5.1690796800647965e-05, + "loss": 0.6456, + "step": 9553 + }, + { + "epoch": 2.8999848231901653, + "grad_norm": 0.6462855935096741, + "learning_rate": 5.168573453477777e-05, + "loss": 1.1709, + "step": 9554 + }, + { + "epoch": 2.900288359386857, + "grad_norm": 0.8171294331550598, + "learning_rate": 5.168067226890757e-05, + "loss": 1.4539, + "step": 9555 + }, + { + "epoch": 2.900591895583548, + "grad_norm": 1.063042163848877, + "learning_rate": 5.167561000303737e-05, + "loss": 1.1501, + "step": 9556 + }, + { + "epoch": 2.90089543178024, + "grad_norm": 0.7977851629257202, + "learning_rate": 5.1670547737167165e-05, + "loss": 1.3462, + "step": 9557 + }, + { + "epoch": 2.9011989679769314, + "grad_norm": 0.8036537766456604, + "learning_rate": 5.166548547129696e-05, + "loss": 1.4145, + "step": 9558 + }, + { + "epoch": 2.9015025041736227, + "grad_norm": 0.8069393038749695, + "learning_rate": 5.1660423205426754e-05, + "loss": 1.193, + "step": 9559 + }, + { + "epoch": 2.9018060403703143, + "grad_norm": 0.6604540348052979, + "learning_rate": 5.165536093955655e-05, + "loss": 1.3344, + "step": 9560 + }, + { + "epoch": 2.9021095765670055, + "grad_norm": 0.8737397193908691, + "learning_rate": 5.165029867368635e-05, + "loss": 1.1529, + "step": 9561 + }, + { + "epoch": 2.902413112763697, + "grad_norm": 0.8647302389144897, + "learning_rate": 5.1645236407816145e-05, + "loss": 1.1524, + "step": 9562 + }, + { + "epoch": 2.9027166489603884, + "grad_norm": 0.7809445261955261, + "learning_rate": 5.164017414194594e-05, + "loss": 1.4176, + "step": 9563 + }, + { + "epoch": 2.90302018515708, + "grad_norm": 0.700145959854126, + "learning_rate": 5.1635111876075735e-05, + "loss": 1.2164, + "step": 9564 + }, + { + "epoch": 2.9033237213537717, + "grad_norm": 0.9407699704170227, + "learning_rate": 5.1630049610205536e-05, + "loss": 1.4232, + "step": 9565 + }, + { + "epoch": 2.903627257550463, + "grad_norm": 0.9683791399002075, + "learning_rate": 5.162498734433533e-05, + "loss": 1.4847, + "step": 9566 + }, + { + "epoch": 2.903930793747154, + "grad_norm": 0.8153687119483948, + "learning_rate": 5.1619925078465126e-05, + "loss": 1.2574, + "step": 9567 + }, + { + "epoch": 2.904234329943846, + "grad_norm": 0.943260133266449, + "learning_rate": 5.161486281259492e-05, + "loss": 1.7097, + "step": 9568 + }, + { + "epoch": 2.9045378661405374, + "grad_norm": 0.84771329164505, + "learning_rate": 5.1609800546724716e-05, + "loss": 1.4961, + "step": 9569 + }, + { + "epoch": 2.9048414023372287, + "grad_norm": 0.6338463425636292, + "learning_rate": 5.160473828085452e-05, + "loss": 1.3485, + "step": 9570 + }, + { + "epoch": 2.9051449385339203, + "grad_norm": 0.6851577162742615, + "learning_rate": 5.159967601498431e-05, + "loss": 1.2563, + "step": 9571 + }, + { + "epoch": 2.9054484747306115, + "grad_norm": 0.8305641412734985, + "learning_rate": 5.159461374911411e-05, + "loss": 1.2146, + "step": 9572 + }, + { + "epoch": 2.905752010927303, + "grad_norm": 1.008200764656067, + "learning_rate": 5.15895514832439e-05, + "loss": 1.2212, + "step": 9573 + }, + { + "epoch": 2.9060555471239944, + "grad_norm": 0.8098403811454773, + "learning_rate": 5.1584489217373696e-05, + "loss": 1.5169, + "step": 9574 + }, + { + "epoch": 2.906359083320686, + "grad_norm": 0.7787508964538574, + "learning_rate": 5.15794269515035e-05, + "loss": 1.3337, + "step": 9575 + }, + { + "epoch": 2.9066626195173777, + "grad_norm": 0.8555715680122375, + "learning_rate": 5.157436468563329e-05, + "loss": 1.1114, + "step": 9576 + }, + { + "epoch": 2.906966155714069, + "grad_norm": 0.6763578653335571, + "learning_rate": 5.156930241976309e-05, + "loss": 1.5864, + "step": 9577 + }, + { + "epoch": 2.90726969191076, + "grad_norm": 0.824381947517395, + "learning_rate": 5.156424015389288e-05, + "loss": 1.5366, + "step": 9578 + }, + { + "epoch": 2.907573228107452, + "grad_norm": 0.8940673470497131, + "learning_rate": 5.1559177888022684e-05, + "loss": 1.1842, + "step": 9579 + }, + { + "epoch": 2.9078767643041434, + "grad_norm": 0.7460834383964539, + "learning_rate": 5.155411562215248e-05, + "loss": 1.3562, + "step": 9580 + }, + { + "epoch": 2.9081803005008346, + "grad_norm": 0.7965441942214966, + "learning_rate": 5.1549053356282273e-05, + "loss": 1.6128, + "step": 9581 + }, + { + "epoch": 2.9084838366975263, + "grad_norm": 0.6389833092689514, + "learning_rate": 5.154399109041207e-05, + "loss": 1.0002, + "step": 9582 + }, + { + "epoch": 2.9087873728942175, + "grad_norm": 0.7806882262229919, + "learning_rate": 5.153892882454186e-05, + "loss": 1.138, + "step": 9583 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 0.7312899231910706, + "learning_rate": 5.1533866558671665e-05, + "loss": 1.114, + "step": 9584 + }, + { + "epoch": 2.9093944452876004, + "grad_norm": 0.5638426542282104, + "learning_rate": 5.152880429280146e-05, + "loss": 0.6022, + "step": 9585 + }, + { + "epoch": 2.909697981484292, + "grad_norm": 0.8103323578834534, + "learning_rate": 5.1523742026931254e-05, + "loss": 1.4652, + "step": 9586 + }, + { + "epoch": 2.9100015176809837, + "grad_norm": 0.7764275074005127, + "learning_rate": 5.151867976106105e-05, + "loss": 1.3598, + "step": 9587 + }, + { + "epoch": 2.910305053877675, + "grad_norm": 0.8000783920288086, + "learning_rate": 5.1513617495190844e-05, + "loss": 1.4708, + "step": 9588 + }, + { + "epoch": 2.910608590074366, + "grad_norm": 0.9029366970062256, + "learning_rate": 5.1508555229320645e-05, + "loss": 1.4317, + "step": 9589 + }, + { + "epoch": 2.9109121262710578, + "grad_norm": 0.6857181191444397, + "learning_rate": 5.150349296345044e-05, + "loss": 1.5334, + "step": 9590 + }, + { + "epoch": 2.9112156624677494, + "grad_norm": 0.7986151576042175, + "learning_rate": 5.1498430697580235e-05, + "loss": 1.5455, + "step": 9591 + }, + { + "epoch": 2.9115191986644406, + "grad_norm": 0.5361661911010742, + "learning_rate": 5.149336843171003e-05, + "loss": 1.1083, + "step": 9592 + }, + { + "epoch": 2.9118227348611323, + "grad_norm": 0.6486913561820984, + "learning_rate": 5.148830616583983e-05, + "loss": 1.1699, + "step": 9593 + }, + { + "epoch": 2.9121262710578235, + "grad_norm": 0.8758334517478943, + "learning_rate": 5.1483243899969626e-05, + "loss": 1.1375, + "step": 9594 + }, + { + "epoch": 2.912429807254515, + "grad_norm": 0.8009307980537415, + "learning_rate": 5.147818163409942e-05, + "loss": 1.3781, + "step": 9595 + }, + { + "epoch": 2.9127333434512064, + "grad_norm": 0.8063926696777344, + "learning_rate": 5.1473119368229216e-05, + "loss": 1.3542, + "step": 9596 + }, + { + "epoch": 2.913036879647898, + "grad_norm": 0.6963333487510681, + "learning_rate": 5.146805710235901e-05, + "loss": 1.5677, + "step": 9597 + }, + { + "epoch": 2.9133404158445897, + "grad_norm": 0.7917062044143677, + "learning_rate": 5.146299483648882e-05, + "loss": 1.2204, + "step": 9598 + }, + { + "epoch": 2.913643952041281, + "grad_norm": 0.7042210698127747, + "learning_rate": 5.1457932570618614e-05, + "loss": 0.8323, + "step": 9599 + }, + { + "epoch": 2.913947488237972, + "grad_norm": 1.0165883302688599, + "learning_rate": 5.1452870304748415e-05, + "loss": 1.3022, + "step": 9600 + }, + { + "epoch": 2.9142510244346638, + "grad_norm": 0.8328631520271301, + "learning_rate": 5.144780803887821e-05, + "loss": 1.1785, + "step": 9601 + }, + { + "epoch": 2.9145545606313554, + "grad_norm": 0.9051679968833923, + "learning_rate": 5.1442745773008005e-05, + "loss": 1.2479, + "step": 9602 + }, + { + "epoch": 2.9148580968280466, + "grad_norm": 0.8050028085708618, + "learning_rate": 5.14376835071378e-05, + "loss": 1.8515, + "step": 9603 + }, + { + "epoch": 2.9151616330247383, + "grad_norm": 0.9980581402778625, + "learning_rate": 5.14326212412676e-05, + "loss": 0.6876, + "step": 9604 + }, + { + "epoch": 2.9154651692214295, + "grad_norm": 0.7276046276092529, + "learning_rate": 5.1427558975397396e-05, + "loss": 0.8367, + "step": 9605 + }, + { + "epoch": 2.915768705418121, + "grad_norm": 0.7519717812538147, + "learning_rate": 5.142249670952719e-05, + "loss": 1.4272, + "step": 9606 + }, + { + "epoch": 2.9160722416148124, + "grad_norm": 0.8044680953025818, + "learning_rate": 5.1417434443656986e-05, + "loss": 1.4931, + "step": 9607 + }, + { + "epoch": 2.916375777811504, + "grad_norm": 0.7219284176826477, + "learning_rate": 5.141237217778678e-05, + "loss": 1.0797, + "step": 9608 + }, + { + "epoch": 2.9166793140081957, + "grad_norm": 0.8786895871162415, + "learning_rate": 5.140730991191658e-05, + "loss": 1.2961, + "step": 9609 + }, + { + "epoch": 2.916982850204887, + "grad_norm": 0.7362821698188782, + "learning_rate": 5.140224764604638e-05, + "loss": 1.4744, + "step": 9610 + }, + { + "epoch": 2.917286386401578, + "grad_norm": 0.8390907049179077, + "learning_rate": 5.139718538017617e-05, + "loss": 1.4064, + "step": 9611 + }, + { + "epoch": 2.9175899225982698, + "grad_norm": 0.9276410341262817, + "learning_rate": 5.1392123114305966e-05, + "loss": 1.0092, + "step": 9612 + }, + { + "epoch": 2.9178934587949614, + "grad_norm": 0.8145900964736938, + "learning_rate": 5.138706084843576e-05, + "loss": 1.3435, + "step": 9613 + }, + { + "epoch": 2.9181969949916526, + "grad_norm": 0.7713693380355835, + "learning_rate": 5.138199858256556e-05, + "loss": 1.1803, + "step": 9614 + }, + { + "epoch": 2.9185005311883443, + "grad_norm": 0.8595828413963318, + "learning_rate": 5.137693631669536e-05, + "loss": 1.1999, + "step": 9615 + }, + { + "epoch": 2.9188040673850355, + "grad_norm": 0.7955202460289001, + "learning_rate": 5.137187405082515e-05, + "loss": 1.5164, + "step": 9616 + }, + { + "epoch": 2.919107603581727, + "grad_norm": 0.8825899362564087, + "learning_rate": 5.136681178495495e-05, + "loss": 1.4261, + "step": 9617 + }, + { + "epoch": 2.9194111397784184, + "grad_norm": 0.8596706986427307, + "learning_rate": 5.136174951908475e-05, + "loss": 0.5435, + "step": 9618 + }, + { + "epoch": 2.91971467597511, + "grad_norm": 0.6747449636459351, + "learning_rate": 5.1356687253214543e-05, + "loss": 1.5789, + "step": 9619 + }, + { + "epoch": 2.9200182121718017, + "grad_norm": 0.8745201826095581, + "learning_rate": 5.135162498734434e-05, + "loss": 1.148, + "step": 9620 + }, + { + "epoch": 2.920321748368493, + "grad_norm": 0.8869763612747192, + "learning_rate": 5.134656272147413e-05, + "loss": 1.367, + "step": 9621 + }, + { + "epoch": 2.9206252845651846, + "grad_norm": 0.7572410106658936, + "learning_rate": 5.134150045560393e-05, + "loss": 1.1945, + "step": 9622 + }, + { + "epoch": 2.9209288207618758, + "grad_norm": 0.8140220046043396, + "learning_rate": 5.133643818973373e-05, + "loss": 1.6378, + "step": 9623 + }, + { + "epoch": 2.9212323569585674, + "grad_norm": 0.8246815204620361, + "learning_rate": 5.1331375923863524e-05, + "loss": 1.3473, + "step": 9624 + }, + { + "epoch": 2.9215358931552586, + "grad_norm": 0.8982497453689575, + "learning_rate": 5.132631365799332e-05, + "loss": 1.2034, + "step": 9625 + }, + { + "epoch": 2.9218394293519503, + "grad_norm": 0.7786431312561035, + "learning_rate": 5.1321251392123114e-05, + "loss": 1.2714, + "step": 9626 + }, + { + "epoch": 2.922142965548642, + "grad_norm": 0.7802687883377075, + "learning_rate": 5.131618912625291e-05, + "loss": 1.2668, + "step": 9627 + }, + { + "epoch": 2.922446501745333, + "grad_norm": 0.632917046546936, + "learning_rate": 5.131112686038271e-05, + "loss": 1.2752, + "step": 9628 + }, + { + "epoch": 2.9227500379420244, + "grad_norm": 0.8978488445281982, + "learning_rate": 5.1306064594512505e-05, + "loss": 1.367, + "step": 9629 + }, + { + "epoch": 2.923053574138716, + "grad_norm": 0.8339922428131104, + "learning_rate": 5.13010023286423e-05, + "loss": 1.3381, + "step": 9630 + }, + { + "epoch": 2.9233571103354077, + "grad_norm": 0.8951248526573181, + "learning_rate": 5.1295940062772095e-05, + "loss": 1.242, + "step": 9631 + }, + { + "epoch": 2.923660646532099, + "grad_norm": 0.8673502802848816, + "learning_rate": 5.1290877796901896e-05, + "loss": 1.0581, + "step": 9632 + }, + { + "epoch": 2.9239641827287906, + "grad_norm": 0.7304636240005493, + "learning_rate": 5.128581553103169e-05, + "loss": 1.5523, + "step": 9633 + }, + { + "epoch": 2.9242677189254818, + "grad_norm": 0.8997844457626343, + "learning_rate": 5.1280753265161486e-05, + "loss": 1.4083, + "step": 9634 + }, + { + "epoch": 2.9245712551221734, + "grad_norm": 0.5966005921363831, + "learning_rate": 5.127569099929128e-05, + "loss": 1.6588, + "step": 9635 + }, + { + "epoch": 2.9248747913188646, + "grad_norm": 0.6863671541213989, + "learning_rate": 5.1270628733421075e-05, + "loss": 1.0795, + "step": 9636 + }, + { + "epoch": 2.9251783275155563, + "grad_norm": 0.947727382183075, + "learning_rate": 5.126556646755088e-05, + "loss": 0.9771, + "step": 9637 + }, + { + "epoch": 2.925481863712248, + "grad_norm": 0.5955187678337097, + "learning_rate": 5.126050420168067e-05, + "loss": 0.8562, + "step": 9638 + }, + { + "epoch": 2.925785399908939, + "grad_norm": 0.5938629508018494, + "learning_rate": 5.1255441935810466e-05, + "loss": 1.4456, + "step": 9639 + }, + { + "epoch": 2.9260889361056304, + "grad_norm": 0.9371960759162903, + "learning_rate": 5.125037966994026e-05, + "loss": 0.9832, + "step": 9640 + }, + { + "epoch": 2.926392472302322, + "grad_norm": 0.7487984299659729, + "learning_rate": 5.1245317404070056e-05, + "loss": 1.4711, + "step": 9641 + }, + { + "epoch": 2.9266960084990137, + "grad_norm": 0.6870548725128174, + "learning_rate": 5.124025513819986e-05, + "loss": 1.4693, + "step": 9642 + }, + { + "epoch": 2.926999544695705, + "grad_norm": 0.7630167007446289, + "learning_rate": 5.1235192872329666e-05, + "loss": 1.3906, + "step": 9643 + }, + { + "epoch": 2.9273030808923965, + "grad_norm": 0.8437421321868896, + "learning_rate": 5.123013060645946e-05, + "loss": 1.7142, + "step": 9644 + }, + { + "epoch": 2.9276066170890878, + "grad_norm": 0.8551376461982727, + "learning_rate": 5.1225068340589256e-05, + "loss": 1.4621, + "step": 9645 + }, + { + "epoch": 2.9279101532857794, + "grad_norm": 1.0669502019882202, + "learning_rate": 5.122000607471905e-05, + "loss": 0.9127, + "step": 9646 + }, + { + "epoch": 2.9282136894824706, + "grad_norm": 0.6872658133506775, + "learning_rate": 5.1214943808848845e-05, + "loss": 1.4721, + "step": 9647 + }, + { + "epoch": 2.9285172256791623, + "grad_norm": 0.6480333209037781, + "learning_rate": 5.120988154297865e-05, + "loss": 1.2266, + "step": 9648 + }, + { + "epoch": 2.928820761875854, + "grad_norm": 0.7079026103019714, + "learning_rate": 5.120481927710844e-05, + "loss": 1.6364, + "step": 9649 + }, + { + "epoch": 2.929124298072545, + "grad_norm": 0.7798513174057007, + "learning_rate": 5.1199757011238236e-05, + "loss": 0.5933, + "step": 9650 + }, + { + "epoch": 2.9294278342692364, + "grad_norm": 0.7039276957511902, + "learning_rate": 5.119469474536803e-05, + "loss": 1.2404, + "step": 9651 + }, + { + "epoch": 2.929731370465928, + "grad_norm": 0.8349506855010986, + "learning_rate": 5.1189632479497826e-05, + "loss": 1.2046, + "step": 9652 + }, + { + "epoch": 2.9300349066626197, + "grad_norm": 0.7625461220741272, + "learning_rate": 5.118457021362763e-05, + "loss": 1.2918, + "step": 9653 + }, + { + "epoch": 2.930338442859311, + "grad_norm": 0.8089103698730469, + "learning_rate": 5.117950794775742e-05, + "loss": 1.4058, + "step": 9654 + }, + { + "epoch": 2.9306419790560025, + "grad_norm": 0.601629376411438, + "learning_rate": 5.117444568188722e-05, + "loss": 1.4394, + "step": 9655 + }, + { + "epoch": 2.9309455152526938, + "grad_norm": 0.7368852496147156, + "learning_rate": 5.116938341601701e-05, + "loss": 1.4202, + "step": 9656 + }, + { + "epoch": 2.9312490514493854, + "grad_norm": 0.5684367418289185, + "learning_rate": 5.1164321150146813e-05, + "loss": 1.8344, + "step": 9657 + }, + { + "epoch": 2.9315525876460766, + "grad_norm": 0.8074974417686462, + "learning_rate": 5.115925888427661e-05, + "loss": 1.5921, + "step": 9658 + }, + { + "epoch": 2.9318561238427683, + "grad_norm": 0.8737492561340332, + "learning_rate": 5.11541966184064e-05, + "loss": 1.026, + "step": 9659 + }, + { + "epoch": 2.93215966003946, + "grad_norm": 0.7643964290618896, + "learning_rate": 5.11491343525362e-05, + "loss": 1.4965, + "step": 9660 + }, + { + "epoch": 2.932463196236151, + "grad_norm": 0.7347926497459412, + "learning_rate": 5.114407208666599e-05, + "loss": 1.5798, + "step": 9661 + }, + { + "epoch": 2.9327667324328424, + "grad_norm": 0.7421404123306274, + "learning_rate": 5.1139009820795794e-05, + "loss": 1.815, + "step": 9662 + }, + { + "epoch": 2.933070268629534, + "grad_norm": 0.7707992196083069, + "learning_rate": 5.113394755492559e-05, + "loss": 1.2817, + "step": 9663 + }, + { + "epoch": 2.9333738048262257, + "grad_norm": 0.6986004710197449, + "learning_rate": 5.1128885289055384e-05, + "loss": 1.4769, + "step": 9664 + }, + { + "epoch": 2.933677341022917, + "grad_norm": 0.8465421199798584, + "learning_rate": 5.112382302318518e-05, + "loss": 1.5801, + "step": 9665 + }, + { + "epoch": 2.9339808772196085, + "grad_norm": 0.7518223524093628, + "learning_rate": 5.111876075731497e-05, + "loss": 1.5368, + "step": 9666 + }, + { + "epoch": 2.9342844134162998, + "grad_norm": 0.7431496381759644, + "learning_rate": 5.1113698491444775e-05, + "loss": 1.7001, + "step": 9667 + }, + { + "epoch": 2.9345879496129914, + "grad_norm": 0.8584191203117371, + "learning_rate": 5.110863622557457e-05, + "loss": 0.7157, + "step": 9668 + }, + { + "epoch": 2.9348914858096826, + "grad_norm": 0.7151068449020386, + "learning_rate": 5.1103573959704364e-05, + "loss": 1.6295, + "step": 9669 + }, + { + "epoch": 2.9351950220063743, + "grad_norm": 0.8421808481216431, + "learning_rate": 5.109851169383416e-05, + "loss": 1.4686, + "step": 9670 + }, + { + "epoch": 2.935498558203066, + "grad_norm": 0.6444612741470337, + "learning_rate": 5.109344942796396e-05, + "loss": 0.8179, + "step": 9671 + }, + { + "epoch": 2.935802094399757, + "grad_norm": 0.6792292594909668, + "learning_rate": 5.1088387162093756e-05, + "loss": 1.4475, + "step": 9672 + }, + { + "epoch": 2.9361056305964484, + "grad_norm": 1.006290316581726, + "learning_rate": 5.108332489622355e-05, + "loss": 1.0705, + "step": 9673 + }, + { + "epoch": 2.93640916679314, + "grad_norm": 0.8089848160743713, + "learning_rate": 5.1078262630353345e-05, + "loss": 1.1588, + "step": 9674 + }, + { + "epoch": 2.9367127029898317, + "grad_norm": 0.7995975017547607, + "learning_rate": 5.107320036448314e-05, + "loss": 1.4301, + "step": 9675 + }, + { + "epoch": 2.937016239186523, + "grad_norm": 0.6885451674461365, + "learning_rate": 5.106813809861294e-05, + "loss": 1.4127, + "step": 9676 + }, + { + "epoch": 2.9373197753832145, + "grad_norm": 0.754417896270752, + "learning_rate": 5.1063075832742736e-05, + "loss": 0.9511, + "step": 9677 + }, + { + "epoch": 2.9376233115799057, + "grad_norm": 0.6752740144729614, + "learning_rate": 5.105801356687253e-05, + "loss": 1.0616, + "step": 9678 + }, + { + "epoch": 2.9379268477765974, + "grad_norm": 0.7772539854049683, + "learning_rate": 5.1052951301002326e-05, + "loss": 1.4327, + "step": 9679 + }, + { + "epoch": 2.9382303839732886, + "grad_norm": 0.8099876046180725, + "learning_rate": 5.104788903513212e-05, + "loss": 1.4187, + "step": 9680 + }, + { + "epoch": 2.9385339201699803, + "grad_norm": 0.9773611426353455, + "learning_rate": 5.104282676926192e-05, + "loss": 0.9895, + "step": 9681 + }, + { + "epoch": 2.938837456366672, + "grad_norm": 0.9407110810279846, + "learning_rate": 5.103776450339172e-05, + "loss": 1.0509, + "step": 9682 + }, + { + "epoch": 2.939140992563363, + "grad_norm": 0.7487874031066895, + "learning_rate": 5.103270223752151e-05, + "loss": 1.1588, + "step": 9683 + }, + { + "epoch": 2.939444528760055, + "grad_norm": 0.8135343790054321, + "learning_rate": 5.102763997165131e-05, + "loss": 1.4319, + "step": 9684 + }, + { + "epoch": 2.939748064956746, + "grad_norm": 0.7436039447784424, + "learning_rate": 5.102257770578111e-05, + "loss": 1.279, + "step": 9685 + }, + { + "epoch": 2.9400516011534377, + "grad_norm": 0.6971882581710815, + "learning_rate": 5.10175154399109e-05, + "loss": 1.3123, + "step": 9686 + }, + { + "epoch": 2.940355137350129, + "grad_norm": 0.7899006009101868, + "learning_rate": 5.101245317404071e-05, + "loss": 1.2072, + "step": 9687 + }, + { + "epoch": 2.9406586735468205, + "grad_norm": 0.685184121131897, + "learning_rate": 5.1007390908170506e-05, + "loss": 1.2253, + "step": 9688 + }, + { + "epoch": 2.940962209743512, + "grad_norm": 1.092652678489685, + "learning_rate": 5.10023286423003e-05, + "loss": 1.2048, + "step": 9689 + }, + { + "epoch": 2.9412657459402034, + "grad_norm": 0.8485744595527649, + "learning_rate": 5.0997266376430096e-05, + "loss": 1.0684, + "step": 9690 + }, + { + "epoch": 2.9415692821368946, + "grad_norm": 0.724848747253418, + "learning_rate": 5.099220411055989e-05, + "loss": 0.9208, + "step": 9691 + }, + { + "epoch": 2.9418728183335863, + "grad_norm": 0.9698922634124756, + "learning_rate": 5.098714184468969e-05, + "loss": 1.0005, + "step": 9692 + }, + { + "epoch": 2.942176354530278, + "grad_norm": 0.8260499835014343, + "learning_rate": 5.098207957881949e-05, + "loss": 1.6745, + "step": 9693 + }, + { + "epoch": 2.942479890726969, + "grad_norm": 0.775804340839386, + "learning_rate": 5.097701731294928e-05, + "loss": 1.5064, + "step": 9694 + }, + { + "epoch": 2.942783426923661, + "grad_norm": 0.8762521743774414, + "learning_rate": 5.0971955047079077e-05, + "loss": 1.4192, + "step": 9695 + }, + { + "epoch": 2.943086963120352, + "grad_norm": 0.9298811554908752, + "learning_rate": 5.096689278120888e-05, + "loss": 1.1981, + "step": 9696 + }, + { + "epoch": 2.9433904993170437, + "grad_norm": 0.8426903486251831, + "learning_rate": 5.096183051533867e-05, + "loss": 1.4391, + "step": 9697 + }, + { + "epoch": 2.943694035513735, + "grad_norm": 0.8833054900169373, + "learning_rate": 5.095676824946847e-05, + "loss": 0.7843, + "step": 9698 + }, + { + "epoch": 2.9439975717104265, + "grad_norm": 0.774456799030304, + "learning_rate": 5.095170598359826e-05, + "loss": 1.5328, + "step": 9699 + }, + { + "epoch": 2.944301107907118, + "grad_norm": 0.8480775356292725, + "learning_rate": 5.094664371772806e-05, + "loss": 1.3889, + "step": 9700 + }, + { + "epoch": 2.9446046441038094, + "grad_norm": 0.8695796132087708, + "learning_rate": 5.094158145185786e-05, + "loss": 1.5308, + "step": 9701 + }, + { + "epoch": 2.9449081803005006, + "grad_norm": 0.8271726369857788, + "learning_rate": 5.0936519185987654e-05, + "loss": 1.5792, + "step": 9702 + }, + { + "epoch": 2.9452117164971923, + "grad_norm": 0.4747374653816223, + "learning_rate": 5.093145692011745e-05, + "loss": 1.243, + "step": 9703 + }, + { + "epoch": 2.945515252693884, + "grad_norm": 1.3622267246246338, + "learning_rate": 5.092639465424724e-05, + "loss": 1.1071, + "step": 9704 + }, + { + "epoch": 2.945818788890575, + "grad_norm": 0.7314203977584839, + "learning_rate": 5.092133238837704e-05, + "loss": 1.1546, + "step": 9705 + }, + { + "epoch": 2.946122325087267, + "grad_norm": 0.8458970785140991, + "learning_rate": 5.091627012250684e-05, + "loss": 1.1194, + "step": 9706 + }, + { + "epoch": 2.946425861283958, + "grad_norm": 0.7926846742630005, + "learning_rate": 5.0911207856636634e-05, + "loss": 1.4845, + "step": 9707 + }, + { + "epoch": 2.9467293974806497, + "grad_norm": 0.8043208122253418, + "learning_rate": 5.090614559076643e-05, + "loss": 1.3745, + "step": 9708 + }, + { + "epoch": 2.947032933677341, + "grad_norm": 0.7073126435279846, + "learning_rate": 5.0901083324896224e-05, + "loss": 1.6979, + "step": 9709 + }, + { + "epoch": 2.9473364698740325, + "grad_norm": 0.9214633703231812, + "learning_rate": 5.0896021059026026e-05, + "loss": 1.2375, + "step": 9710 + }, + { + "epoch": 2.947640006070724, + "grad_norm": 0.8063821196556091, + "learning_rate": 5.089095879315582e-05, + "loss": 1.6225, + "step": 9711 + }, + { + "epoch": 2.9479435422674154, + "grad_norm": 0.9422792792320251, + "learning_rate": 5.0885896527285615e-05, + "loss": 1.3707, + "step": 9712 + }, + { + "epoch": 2.9482470784641066, + "grad_norm": 0.7002964019775391, + "learning_rate": 5.088083426141541e-05, + "loss": 1.3171, + "step": 9713 + }, + { + "epoch": 2.9485506146607983, + "grad_norm": 0.715103268623352, + "learning_rate": 5.0875771995545205e-05, + "loss": 1.3321, + "step": 9714 + }, + { + "epoch": 2.94885415085749, + "grad_norm": 0.7000694274902344, + "learning_rate": 5.0870709729675006e-05, + "loss": 1.7362, + "step": 9715 + }, + { + "epoch": 2.949157687054181, + "grad_norm": 0.6491566896438599, + "learning_rate": 5.08656474638048e-05, + "loss": 1.4304, + "step": 9716 + }, + { + "epoch": 2.949461223250873, + "grad_norm": 0.8144864439964294, + "learning_rate": 5.0860585197934596e-05, + "loss": 1.3623, + "step": 9717 + }, + { + "epoch": 2.949764759447564, + "grad_norm": 0.9529521465301514, + "learning_rate": 5.085552293206439e-05, + "loss": 1.3465, + "step": 9718 + }, + { + "epoch": 2.9500682956442557, + "grad_norm": 0.6080120205879211, + "learning_rate": 5.0850460666194186e-05, + "loss": 1.6269, + "step": 9719 + }, + { + "epoch": 2.950371831840947, + "grad_norm": 0.8070117235183716, + "learning_rate": 5.084539840032399e-05, + "loss": 1.1286, + "step": 9720 + }, + { + "epoch": 2.9506753680376385, + "grad_norm": 0.809084415435791, + "learning_rate": 5.084033613445378e-05, + "loss": 1.3545, + "step": 9721 + }, + { + "epoch": 2.95097890423433, + "grad_norm": 0.8189207315444946, + "learning_rate": 5.083527386858358e-05, + "loss": 1.1994, + "step": 9722 + }, + { + "epoch": 2.9512824404310214, + "grad_norm": 0.8860880732536316, + "learning_rate": 5.083021160271337e-05, + "loss": 1.3388, + "step": 9723 + }, + { + "epoch": 2.9515859766277126, + "grad_norm": 0.7872827053070068, + "learning_rate": 5.082514933684317e-05, + "loss": 1.1414, + "step": 9724 + }, + { + "epoch": 2.9518895128244043, + "grad_norm": 0.6410046815872192, + "learning_rate": 5.082008707097297e-05, + "loss": 0.9255, + "step": 9725 + }, + { + "epoch": 2.952193049021096, + "grad_norm": 0.7178161144256592, + "learning_rate": 5.081502480510276e-05, + "loss": 1.3847, + "step": 9726 + }, + { + "epoch": 2.952496585217787, + "grad_norm": 0.7075746059417725, + "learning_rate": 5.080996253923256e-05, + "loss": 1.5949, + "step": 9727 + }, + { + "epoch": 2.952800121414479, + "grad_norm": 0.8325478434562683, + "learning_rate": 5.080490027336235e-05, + "loss": 1.283, + "step": 9728 + }, + { + "epoch": 2.95310365761117, + "grad_norm": 0.7608327865600586, + "learning_rate": 5.0799838007492154e-05, + "loss": 1.3592, + "step": 9729 + }, + { + "epoch": 2.9534071938078617, + "grad_norm": 0.7346392273902893, + "learning_rate": 5.079477574162195e-05, + "loss": 1.5977, + "step": 9730 + }, + { + "epoch": 2.953710730004553, + "grad_norm": 0.8925238251686096, + "learning_rate": 5.0789713475751743e-05, + "loss": 1.3908, + "step": 9731 + }, + { + "epoch": 2.9540142662012445, + "grad_norm": 0.727159321308136, + "learning_rate": 5.078465120988155e-05, + "loss": 0.8633, + "step": 9732 + }, + { + "epoch": 2.954317802397936, + "grad_norm": 0.7768084406852722, + "learning_rate": 5.0779588944011347e-05, + "loss": 0.7428, + "step": 9733 + }, + { + "epoch": 2.9546213385946274, + "grad_norm": 0.7659957408905029, + "learning_rate": 5.077452667814114e-05, + "loss": 1.1148, + "step": 9734 + }, + { + "epoch": 2.9549248747913186, + "grad_norm": 0.8714812397956848, + "learning_rate": 5.076946441227094e-05, + "loss": 1.4103, + "step": 9735 + }, + { + "epoch": 2.9552284109880103, + "grad_norm": 0.7689177393913269, + "learning_rate": 5.076440214640074e-05, + "loss": 1.0334, + "step": 9736 + }, + { + "epoch": 2.955531947184702, + "grad_norm": 0.9407566785812378, + "learning_rate": 5.075933988053053e-05, + "loss": 1.3897, + "step": 9737 + }, + { + "epoch": 2.955835483381393, + "grad_norm": 0.7912222743034363, + "learning_rate": 5.075427761466033e-05, + "loss": 1.2953, + "step": 9738 + }, + { + "epoch": 2.956139019578085, + "grad_norm": 0.8030030727386475, + "learning_rate": 5.074921534879012e-05, + "loss": 1.4195, + "step": 9739 + }, + { + "epoch": 2.956442555774776, + "grad_norm": 0.8280852437019348, + "learning_rate": 5.0744153082919924e-05, + "loss": 0.9818, + "step": 9740 + }, + { + "epoch": 2.9567460919714676, + "grad_norm": 0.7365915775299072, + "learning_rate": 5.073909081704972e-05, + "loss": 1.7192, + "step": 9741 + }, + { + "epoch": 2.957049628168159, + "grad_norm": 0.6640022397041321, + "learning_rate": 5.073402855117951e-05, + "loss": 0.7376, + "step": 9742 + }, + { + "epoch": 2.9573531643648505, + "grad_norm": 0.9272693991661072, + "learning_rate": 5.072896628530931e-05, + "loss": 1.4146, + "step": 9743 + }, + { + "epoch": 2.957656700561542, + "grad_norm": 0.7189601063728333, + "learning_rate": 5.07239040194391e-05, + "loss": 1.3483, + "step": 9744 + }, + { + "epoch": 2.9579602367582334, + "grad_norm": 0.8663241267204285, + "learning_rate": 5.0718841753568904e-05, + "loss": 1.1175, + "step": 9745 + }, + { + "epoch": 2.9582637729549246, + "grad_norm": 0.7305809855461121, + "learning_rate": 5.07137794876987e-05, + "loss": 1.5778, + "step": 9746 + }, + { + "epoch": 2.9585673091516163, + "grad_norm": 0.7029976844787598, + "learning_rate": 5.0708717221828494e-05, + "loss": 1.2949, + "step": 9747 + }, + { + "epoch": 2.958870845348308, + "grad_norm": 0.5989913940429688, + "learning_rate": 5.070365495595829e-05, + "loss": 1.3814, + "step": 9748 + }, + { + "epoch": 2.959174381544999, + "grad_norm": 0.7960646152496338, + "learning_rate": 5.069859269008809e-05, + "loss": 1.6714, + "step": 9749 + }, + { + "epoch": 2.9594779177416908, + "grad_norm": 0.7953382730484009, + "learning_rate": 5.0693530424217885e-05, + "loss": 1.0393, + "step": 9750 + }, + { + "epoch": 2.959781453938382, + "grad_norm": 0.5406391620635986, + "learning_rate": 5.068846815834768e-05, + "loss": 1.0639, + "step": 9751 + }, + { + "epoch": 2.9600849901350736, + "grad_norm": 0.7291198372840881, + "learning_rate": 5.0683405892477475e-05, + "loss": 0.8658, + "step": 9752 + }, + { + "epoch": 2.960388526331765, + "grad_norm": 0.870521605014801, + "learning_rate": 5.067834362660727e-05, + "loss": 1.5244, + "step": 9753 + }, + { + "epoch": 2.9606920625284565, + "grad_norm": 0.7896977066993713, + "learning_rate": 5.067328136073707e-05, + "loss": 1.8605, + "step": 9754 + }, + { + "epoch": 2.960995598725148, + "grad_norm": 0.8984799385070801, + "learning_rate": 5.0668219094866866e-05, + "loss": 0.8763, + "step": 9755 + }, + { + "epoch": 2.9612991349218394, + "grad_norm": 0.8268417119979858, + "learning_rate": 5.066315682899666e-05, + "loss": 1.4706, + "step": 9756 + }, + { + "epoch": 2.961602671118531, + "grad_norm": 0.8731755018234253, + "learning_rate": 5.0658094563126455e-05, + "loss": 1.3214, + "step": 9757 + }, + { + "epoch": 2.9619062073152223, + "grad_norm": 0.8329451084136963, + "learning_rate": 5.065303229725625e-05, + "loss": 1.4599, + "step": 9758 + }, + { + "epoch": 2.962209743511914, + "grad_norm": 0.9944485425949097, + "learning_rate": 5.064797003138605e-05, + "loss": 1.2268, + "step": 9759 + }, + { + "epoch": 2.962513279708605, + "grad_norm": 0.8748295307159424, + "learning_rate": 5.064290776551585e-05, + "loss": 1.5343, + "step": 9760 + }, + { + "epoch": 2.9628168159052968, + "grad_norm": 0.734396755695343, + "learning_rate": 5.063784549964564e-05, + "loss": 0.916, + "step": 9761 + }, + { + "epoch": 2.9631203521019884, + "grad_norm": 1.1190379858016968, + "learning_rate": 5.0632783233775436e-05, + "loss": 1.0479, + "step": 9762 + }, + { + "epoch": 2.9634238882986796, + "grad_norm": 0.6809061169624329, + "learning_rate": 5.062772096790524e-05, + "loss": 1.6105, + "step": 9763 + }, + { + "epoch": 2.963727424495371, + "grad_norm": 0.895285964012146, + "learning_rate": 5.062265870203503e-05, + "loss": 1.1615, + "step": 9764 + }, + { + "epoch": 2.9640309606920625, + "grad_norm": 0.8676573634147644, + "learning_rate": 5.061759643616483e-05, + "loss": 1.2873, + "step": 9765 + }, + { + "epoch": 2.964334496888754, + "grad_norm": 0.9381932616233826, + "learning_rate": 5.061253417029462e-05, + "loss": 1.4655, + "step": 9766 + }, + { + "epoch": 2.9646380330854454, + "grad_norm": 0.7882171869277954, + "learning_rate": 5.060747190442442e-05, + "loss": 1.1012, + "step": 9767 + }, + { + "epoch": 2.964941569282137, + "grad_norm": 0.7521642446517944, + "learning_rate": 5.060240963855422e-05, + "loss": 0.8543, + "step": 9768 + }, + { + "epoch": 2.9652451054788282, + "grad_norm": 0.6908265352249146, + "learning_rate": 5.059734737268401e-05, + "loss": 1.6366, + "step": 9769 + }, + { + "epoch": 2.96554864167552, + "grad_norm": 0.8038427829742432, + "learning_rate": 5.059228510681381e-05, + "loss": 1.4779, + "step": 9770 + }, + { + "epoch": 2.965852177872211, + "grad_norm": 0.7121046781539917, + "learning_rate": 5.05872228409436e-05, + "loss": 1.0363, + "step": 9771 + }, + { + "epoch": 2.9661557140689028, + "grad_norm": 0.8224244117736816, + "learning_rate": 5.05821605750734e-05, + "loss": 1.2618, + "step": 9772 + }, + { + "epoch": 2.9664592502655944, + "grad_norm": 0.9318447113037109, + "learning_rate": 5.05770983092032e-05, + "loss": 1.4278, + "step": 9773 + }, + { + "epoch": 2.9667627864622856, + "grad_norm": 0.8800509572029114, + "learning_rate": 5.0572036043332994e-05, + "loss": 1.1878, + "step": 9774 + }, + { + "epoch": 2.967066322658977, + "grad_norm": 0.6945512294769287, + "learning_rate": 5.056697377746279e-05, + "loss": 1.4063, + "step": 9775 + }, + { + "epoch": 2.9673698588556685, + "grad_norm": 0.850615918636322, + "learning_rate": 5.05619115115926e-05, + "loss": 1.6237, + "step": 9776 + }, + { + "epoch": 2.96767339505236, + "grad_norm": 0.7711875438690186, + "learning_rate": 5.055684924572239e-05, + "loss": 1.3161, + "step": 9777 + }, + { + "epoch": 2.9679769312490514, + "grad_norm": 0.9047555923461914, + "learning_rate": 5.055178697985219e-05, + "loss": 1.4571, + "step": 9778 + }, + { + "epoch": 2.968280467445743, + "grad_norm": 0.8342523574829102, + "learning_rate": 5.054672471398199e-05, + "loss": 1.4259, + "step": 9779 + }, + { + "epoch": 2.9685840036424342, + "grad_norm": 0.7356086373329163, + "learning_rate": 5.054166244811178e-05, + "loss": 1.7979, + "step": 9780 + }, + { + "epoch": 2.968887539839126, + "grad_norm": 0.8583924770355225, + "learning_rate": 5.053660018224158e-05, + "loss": 1.6187, + "step": 9781 + }, + { + "epoch": 2.969191076035817, + "grad_norm": 0.6908389925956726, + "learning_rate": 5.053153791637137e-05, + "loss": 1.7816, + "step": 9782 + }, + { + "epoch": 2.9694946122325088, + "grad_norm": 0.7292888760566711, + "learning_rate": 5.052647565050117e-05, + "loss": 1.6162, + "step": 9783 + }, + { + "epoch": 2.9697981484292004, + "grad_norm": 0.7569924592971802, + "learning_rate": 5.052141338463097e-05, + "loss": 1.4642, + "step": 9784 + }, + { + "epoch": 2.9701016846258916, + "grad_norm": 0.9216084480285645, + "learning_rate": 5.0516351118760764e-05, + "loss": 1.4773, + "step": 9785 + }, + { + "epoch": 2.970405220822583, + "grad_norm": 1.0300320386886597, + "learning_rate": 5.051128885289056e-05, + "loss": 0.8327, + "step": 9786 + }, + { + "epoch": 2.9707087570192745, + "grad_norm": 0.6576038002967834, + "learning_rate": 5.0506226587020354e-05, + "loss": 1.2105, + "step": 9787 + }, + { + "epoch": 2.971012293215966, + "grad_norm": 0.812941312789917, + "learning_rate": 5.0501164321150155e-05, + "loss": 1.156, + "step": 9788 + }, + { + "epoch": 2.9713158294126574, + "grad_norm": 0.71174156665802, + "learning_rate": 5.049610205527995e-05, + "loss": 1.2858, + "step": 9789 + }, + { + "epoch": 2.971619365609349, + "grad_norm": 0.7057084441184998, + "learning_rate": 5.0491039789409745e-05, + "loss": 0.8433, + "step": 9790 + }, + { + "epoch": 2.9719229018060402, + "grad_norm": 0.9108096957206726, + "learning_rate": 5.048597752353954e-05, + "loss": 1.0238, + "step": 9791 + }, + { + "epoch": 2.972226438002732, + "grad_norm": 0.8809316158294678, + "learning_rate": 5.0480915257669334e-05, + "loss": 1.2879, + "step": 9792 + }, + { + "epoch": 2.972529974199423, + "grad_norm": 0.7910594940185547, + "learning_rate": 5.0475852991799136e-05, + "loss": 1.4888, + "step": 9793 + }, + { + "epoch": 2.9728335103961148, + "grad_norm": 0.713735044002533, + "learning_rate": 5.047079072592893e-05, + "loss": 1.5353, + "step": 9794 + }, + { + "epoch": 2.9731370465928064, + "grad_norm": 0.829972505569458, + "learning_rate": 5.0465728460058725e-05, + "loss": 1.075, + "step": 9795 + }, + { + "epoch": 2.9734405827894976, + "grad_norm": 0.8665021657943726, + "learning_rate": 5.046066619418852e-05, + "loss": 1.4595, + "step": 9796 + }, + { + "epoch": 2.973744118986189, + "grad_norm": 0.7877386808395386, + "learning_rate": 5.0455603928318315e-05, + "loss": 1.5096, + "step": 9797 + }, + { + "epoch": 2.9740476551828805, + "grad_norm": 0.7499648928642273, + "learning_rate": 5.0450541662448117e-05, + "loss": 1.3817, + "step": 9798 + }, + { + "epoch": 2.974351191379572, + "grad_norm": 0.6918804049491882, + "learning_rate": 5.044547939657791e-05, + "loss": 1.2928, + "step": 9799 + }, + { + "epoch": 2.9746547275762634, + "grad_norm": 0.647549033164978, + "learning_rate": 5.0440417130707706e-05, + "loss": 1.339, + "step": 9800 + }, + { + "epoch": 2.974958263772955, + "grad_norm": 0.8811267614364624, + "learning_rate": 5.04353548648375e-05, + "loss": 1.1343, + "step": 9801 + }, + { + "epoch": 2.9752617999696462, + "grad_norm": 0.8627746105194092, + "learning_rate": 5.04302925989673e-05, + "loss": 1.1734, + "step": 9802 + }, + { + "epoch": 2.975565336166338, + "grad_norm": 0.5508565306663513, + "learning_rate": 5.04252303330971e-05, + "loss": 0.8058, + "step": 9803 + }, + { + "epoch": 2.975868872363029, + "grad_norm": 0.832922637462616, + "learning_rate": 5.042016806722689e-05, + "loss": 1.4417, + "step": 9804 + }, + { + "epoch": 2.9761724085597208, + "grad_norm": 0.6043367981910706, + "learning_rate": 5.041510580135669e-05, + "loss": 1.534, + "step": 9805 + }, + { + "epoch": 2.9764759447564124, + "grad_norm": 0.7686224579811096, + "learning_rate": 5.041004353548648e-05, + "loss": 1.2629, + "step": 9806 + }, + { + "epoch": 2.9767794809531036, + "grad_norm": 0.697522759437561, + "learning_rate": 5.040498126961628e-05, + "loss": 1.4228, + "step": 9807 + }, + { + "epoch": 2.977083017149795, + "grad_norm": 0.6715698838233948, + "learning_rate": 5.039991900374608e-05, + "loss": 1.5535, + "step": 9808 + }, + { + "epoch": 2.9773865533464865, + "grad_norm": 0.8603852391242981, + "learning_rate": 5.039485673787587e-05, + "loss": 1.0935, + "step": 9809 + }, + { + "epoch": 2.977690089543178, + "grad_norm": 0.7181248664855957, + "learning_rate": 5.038979447200567e-05, + "loss": 1.4258, + "step": 9810 + }, + { + "epoch": 2.9779936257398694, + "grad_norm": 0.9570068120956421, + "learning_rate": 5.038473220613546e-05, + "loss": 1.2175, + "step": 9811 + }, + { + "epoch": 2.978297161936561, + "grad_norm": 0.8859447240829468, + "learning_rate": 5.0379669940265264e-05, + "loss": 1.2012, + "step": 9812 + }, + { + "epoch": 2.9786006981332522, + "grad_norm": 0.7729933857917786, + "learning_rate": 5.037460767439506e-05, + "loss": 1.5303, + "step": 9813 + }, + { + "epoch": 2.978904234329944, + "grad_norm": 0.8890818357467651, + "learning_rate": 5.0369545408524854e-05, + "loss": 1.3881, + "step": 9814 + }, + { + "epoch": 2.979207770526635, + "grad_norm": 1.0364371538162231, + "learning_rate": 5.036448314265465e-05, + "loss": 1.4075, + "step": 9815 + }, + { + "epoch": 2.9795113067233268, + "grad_norm": 0.6747880578041077, + "learning_rate": 5.035942087678445e-05, + "loss": 0.9969, + "step": 9816 + }, + { + "epoch": 2.9798148429200184, + "grad_norm": 0.8688072562217712, + "learning_rate": 5.0354358610914245e-05, + "loss": 1.5621, + "step": 9817 + }, + { + "epoch": 2.9801183791167096, + "grad_norm": 0.8358421921730042, + "learning_rate": 5.034929634504404e-05, + "loss": 1.4759, + "step": 9818 + }, + { + "epoch": 2.9804219153134013, + "grad_norm": 0.7541760206222534, + "learning_rate": 5.0344234079173834e-05, + "loss": 0.7761, + "step": 9819 + }, + { + "epoch": 2.9807254515100925, + "grad_norm": 0.6867856979370117, + "learning_rate": 5.033917181330363e-05, + "loss": 1.726, + "step": 9820 + }, + { + "epoch": 2.981028987706784, + "grad_norm": 0.6554029583930969, + "learning_rate": 5.033410954743344e-05, + "loss": 1.0531, + "step": 9821 + }, + { + "epoch": 2.9813325239034754, + "grad_norm": 0.8928065896034241, + "learning_rate": 5.032904728156323e-05, + "loss": 0.8594, + "step": 9822 + }, + { + "epoch": 2.981636060100167, + "grad_norm": 0.8254424929618835, + "learning_rate": 5.0323985015693034e-05, + "loss": 1.4491, + "step": 9823 + }, + { + "epoch": 2.9819395962968587, + "grad_norm": 0.8873692750930786, + "learning_rate": 5.031892274982283e-05, + "loss": 1.3563, + "step": 9824 + }, + { + "epoch": 2.98224313249355, + "grad_norm": 0.8543142676353455, + "learning_rate": 5.0313860483952623e-05, + "loss": 1.2437, + "step": 9825 + }, + { + "epoch": 2.982546668690241, + "grad_norm": 0.8088618516921997, + "learning_rate": 5.030879821808242e-05, + "loss": 1.5201, + "step": 9826 + }, + { + "epoch": 2.9828502048869328, + "grad_norm": 0.7048816680908203, + "learning_rate": 5.030373595221221e-05, + "loss": 1.72, + "step": 9827 + }, + { + "epoch": 2.9831537410836244, + "grad_norm": 0.9001889228820801, + "learning_rate": 5.0298673686342015e-05, + "loss": 1.1851, + "step": 9828 + }, + { + "epoch": 2.9834572772803156, + "grad_norm": 0.7622965574264526, + "learning_rate": 5.029361142047181e-05, + "loss": 1.3831, + "step": 9829 + }, + { + "epoch": 2.9837608134770073, + "grad_norm": 0.8391376733779907, + "learning_rate": 5.0288549154601604e-05, + "loss": 1.4729, + "step": 9830 + }, + { + "epoch": 2.9840643496736985, + "grad_norm": 0.9648115038871765, + "learning_rate": 5.02834868887314e-05, + "loss": 1.0646, + "step": 9831 + }, + { + "epoch": 2.98436788587039, + "grad_norm": 0.6593850255012512, + "learning_rate": 5.02784246228612e-05, + "loss": 1.5846, + "step": 9832 + }, + { + "epoch": 2.9846714220670814, + "grad_norm": 0.8097171187400818, + "learning_rate": 5.0273362356990995e-05, + "loss": 1.0068, + "step": 9833 + }, + { + "epoch": 2.984974958263773, + "grad_norm": 0.5215747356414795, + "learning_rate": 5.026830009112079e-05, + "loss": 1.2893, + "step": 9834 + }, + { + "epoch": 2.9852784944604647, + "grad_norm": 0.824794352054596, + "learning_rate": 5.0263237825250585e-05, + "loss": 1.5381, + "step": 9835 + }, + { + "epoch": 2.985582030657156, + "grad_norm": 0.6967061161994934, + "learning_rate": 5.025817555938038e-05, + "loss": 1.3467, + "step": 9836 + }, + { + "epoch": 2.985885566853847, + "grad_norm": 0.7044780254364014, + "learning_rate": 5.025311329351018e-05, + "loss": 1.3884, + "step": 9837 + }, + { + "epoch": 2.9861891030505388, + "grad_norm": 0.7709345817565918, + "learning_rate": 5.0248051027639976e-05, + "loss": 1.0223, + "step": 9838 + }, + { + "epoch": 2.9864926392472304, + "grad_norm": 0.7379102110862732, + "learning_rate": 5.024298876176977e-05, + "loss": 0.8531, + "step": 9839 + }, + { + "epoch": 2.9867961754439216, + "grad_norm": 0.8930834531784058, + "learning_rate": 5.0237926495899566e-05, + "loss": 1.3216, + "step": 9840 + }, + { + "epoch": 2.9870997116406133, + "grad_norm": 0.9065871834754944, + "learning_rate": 5.023286423002937e-05, + "loss": 1.1774, + "step": 9841 + }, + { + "epoch": 2.9874032478373045, + "grad_norm": 0.8405609726905823, + "learning_rate": 5.022780196415916e-05, + "loss": 0.9348, + "step": 9842 + }, + { + "epoch": 2.987706784033996, + "grad_norm": 0.7697071433067322, + "learning_rate": 5.022273969828896e-05, + "loss": 1.4181, + "step": 9843 + }, + { + "epoch": 2.9880103202306874, + "grad_norm": 0.7778381705284119, + "learning_rate": 5.021767743241875e-05, + "loss": 1.706, + "step": 9844 + }, + { + "epoch": 2.988313856427379, + "grad_norm": 0.7781450152397156, + "learning_rate": 5.0212615166548546e-05, + "loss": 1.2356, + "step": 9845 + }, + { + "epoch": 2.9886173926240707, + "grad_norm": 0.8738018870353699, + "learning_rate": 5.020755290067835e-05, + "loss": 1.4512, + "step": 9846 + }, + { + "epoch": 2.988920928820762, + "grad_norm": 0.9768234491348267, + "learning_rate": 5.020249063480814e-05, + "loss": 1.471, + "step": 9847 + }, + { + "epoch": 2.989224465017453, + "grad_norm": 0.8589128255844116, + "learning_rate": 5.019742836893794e-05, + "loss": 1.0007, + "step": 9848 + }, + { + "epoch": 2.9895280012141447, + "grad_norm": 0.8861180543899536, + "learning_rate": 5.019236610306773e-05, + "loss": 1.2349, + "step": 9849 + }, + { + "epoch": 2.9898315374108364, + "grad_norm": 0.8022412657737732, + "learning_rate": 5.018730383719753e-05, + "loss": 1.1733, + "step": 9850 + }, + { + "epoch": 2.9901350736075276, + "grad_norm": 0.875621497631073, + "learning_rate": 5.018224157132733e-05, + "loss": 1.6376, + "step": 9851 + }, + { + "epoch": 2.9904386098042193, + "grad_norm": 0.7509286403656006, + "learning_rate": 5.0177179305457124e-05, + "loss": 1.2079, + "step": 9852 + }, + { + "epoch": 2.9907421460009105, + "grad_norm": 0.8753302097320557, + "learning_rate": 5.017211703958692e-05, + "loss": 1.34, + "step": 9853 + }, + { + "epoch": 2.991045682197602, + "grad_norm": 0.7396566271781921, + "learning_rate": 5.016705477371671e-05, + "loss": 1.4037, + "step": 9854 + }, + { + "epoch": 2.9913492183942934, + "grad_norm": 0.6811649203300476, + "learning_rate": 5.0161992507846515e-05, + "loss": 1.7496, + "step": 9855 + }, + { + "epoch": 2.991652754590985, + "grad_norm": 0.6608646512031555, + "learning_rate": 5.015693024197631e-05, + "loss": 1.5362, + "step": 9856 + }, + { + "epoch": 2.9919562907876767, + "grad_norm": 0.7470353841781616, + "learning_rate": 5.0151867976106104e-05, + "loss": 1.1998, + "step": 9857 + }, + { + "epoch": 2.992259826984368, + "grad_norm": 0.7401373982429504, + "learning_rate": 5.01468057102359e-05, + "loss": 1.4646, + "step": 9858 + }, + { + "epoch": 2.992563363181059, + "grad_norm": 0.7070319652557373, + "learning_rate": 5.0141743444365694e-05, + "loss": 1.0402, + "step": 9859 + }, + { + "epoch": 2.9928668993777507, + "grad_norm": 0.7485304474830627, + "learning_rate": 5.0136681178495496e-05, + "loss": 1.1112, + "step": 9860 + }, + { + "epoch": 2.9931704355744424, + "grad_norm": 0.7815187573432922, + "learning_rate": 5.013161891262529e-05, + "loss": 1.2715, + "step": 9861 + }, + { + "epoch": 2.9934739717711336, + "grad_norm": 0.6393333077430725, + "learning_rate": 5.0126556646755085e-05, + "loss": 1.2626, + "step": 9862 + }, + { + "epoch": 2.9937775079678253, + "grad_norm": 0.7579580545425415, + "learning_rate": 5.012149438088488e-05, + "loss": 1.3921, + "step": 9863 + }, + { + "epoch": 2.9940810441645165, + "grad_norm": 0.8907715678215027, + "learning_rate": 5.0116432115014675e-05, + "loss": 1.1476, + "step": 9864 + }, + { + "epoch": 2.994384580361208, + "grad_norm": 0.7173703908920288, + "learning_rate": 5.011136984914448e-05, + "loss": 1.5022, + "step": 9865 + }, + { + "epoch": 2.9946881165578993, + "grad_norm": 0.8615228533744812, + "learning_rate": 5.010630758327428e-05, + "loss": 1.3103, + "step": 9866 + }, + { + "epoch": 2.994991652754591, + "grad_norm": 0.9548215866088867, + "learning_rate": 5.010124531740408e-05, + "loss": 1.3148, + "step": 9867 + }, + { + "epoch": 2.9952951889512827, + "grad_norm": 0.7956007122993469, + "learning_rate": 5.0096183051533874e-05, + "loss": 1.1208, + "step": 9868 + }, + { + "epoch": 2.995598725147974, + "grad_norm": 1.16354501247406, + "learning_rate": 5.009112078566367e-05, + "loss": 0.8042, + "step": 9869 + }, + { + "epoch": 2.995902261344665, + "grad_norm": 1.0169782638549805, + "learning_rate": 5.0086058519793464e-05, + "loss": 1.308, + "step": 9870 + }, + { + "epoch": 2.9962057975413567, + "grad_norm": 0.8002044558525085, + "learning_rate": 5.0080996253923265e-05, + "loss": 1.6956, + "step": 9871 + }, + { + "epoch": 2.9965093337380484, + "grad_norm": 0.7645939588546753, + "learning_rate": 5.007593398805306e-05, + "loss": 0.7603, + "step": 9872 + }, + { + "epoch": 2.9968128699347396, + "grad_norm": 0.7191788554191589, + "learning_rate": 5.0070871722182855e-05, + "loss": 1.4339, + "step": 9873 + }, + { + "epoch": 2.9971164061314313, + "grad_norm": 0.7696192264556885, + "learning_rate": 5.006580945631265e-05, + "loss": 1.593, + "step": 9874 + }, + { + "epoch": 2.9974199423281225, + "grad_norm": 0.9107735753059387, + "learning_rate": 5.0060747190442445e-05, + "loss": 1.2468, + "step": 9875 + }, + { + "epoch": 2.997723478524814, + "grad_norm": 0.7734604477882385, + "learning_rate": 5.0055684924572246e-05, + "loss": 0.857, + "step": 9876 + }, + { + "epoch": 2.9980270147215053, + "grad_norm": 0.6583893895149231, + "learning_rate": 5.005062265870204e-05, + "loss": 1.7402, + "step": 9877 + }, + { + "epoch": 2.998330550918197, + "grad_norm": 0.800261914730072, + "learning_rate": 5.0045560392831836e-05, + "loss": 1.6387, + "step": 9878 + }, + { + "epoch": 2.9986340871148887, + "grad_norm": 0.8133311867713928, + "learning_rate": 5.004049812696163e-05, + "loss": 1.3963, + "step": 9879 + }, + { + "epoch": 2.99893762331158, + "grad_norm": 0.5534855723381042, + "learning_rate": 5.0035435861091425e-05, + "loss": 1.6639, + "step": 9880 + }, + { + "epoch": 2.9992411595082715, + "grad_norm": 0.7287002801895142, + "learning_rate": 5.003037359522123e-05, + "loss": 1.4862, + "step": 9881 + }, + { + "epoch": 2.9995446957049627, + "grad_norm": 0.6345083117485046, + "learning_rate": 5.002531132935102e-05, + "loss": 1.1753, + "step": 9882 + }, + { + "epoch": 2.9998482319016544, + "grad_norm": 0.850407600402832, + "learning_rate": 5.0020249063480816e-05, + "loss": 1.2063, + "step": 9883 + }, + { + "epoch": 3.0001517680983456, + "grad_norm": 0.6992197632789612, + "learning_rate": 5.001518679761061e-05, + "loss": 0.9919, + "step": 9884 + }, + { + "epoch": 3.0004553042950373, + "grad_norm": 0.6799697279930115, + "learning_rate": 5.001012453174041e-05, + "loss": 0.6358, + "step": 9885 + }, + { + "epoch": 3.0007588404917285, + "grad_norm": 0.6441646814346313, + "learning_rate": 5.000506226587021e-05, + "loss": 1.6449, + "step": 9886 + }, + { + "epoch": 3.00106237668842, + "grad_norm": 0.8453091979026794, + "learning_rate": 5e-05, + "loss": 1.3043, + "step": 9887 + }, + { + "epoch": 3.0013659128851113, + "grad_norm": 0.6195229291915894, + "learning_rate": 4.99949377341298e-05, + "loss": 0.4865, + "step": 9888 + }, + { + "epoch": 3.001669449081803, + "grad_norm": 0.8482832312583923, + "learning_rate": 4.998987546825959e-05, + "loss": 0.9909, + "step": 9889 + }, + { + "epoch": 3.0019729852784947, + "grad_norm": 1.1187256574630737, + "learning_rate": 4.9984813202389394e-05, + "loss": 0.86, + "step": 9890 + }, + { + "epoch": 3.002276521475186, + "grad_norm": 0.922222375869751, + "learning_rate": 4.997975093651919e-05, + "loss": 1.1761, + "step": 9891 + }, + { + "epoch": 3.0025800576718775, + "grad_norm": 1.2568992376327515, + "learning_rate": 4.997468867064898e-05, + "loss": 0.9785, + "step": 9892 + }, + { + "epoch": 3.0028835938685687, + "grad_norm": 1.0342175960540771, + "learning_rate": 4.996962640477878e-05, + "loss": 0.6732, + "step": 9893 + }, + { + "epoch": 3.0031871300652604, + "grad_norm": 1.0421677827835083, + "learning_rate": 4.996456413890858e-05, + "loss": 1.3249, + "step": 9894 + }, + { + "epoch": 3.0034906662619516, + "grad_norm": 0.9691305160522461, + "learning_rate": 4.9959501873038374e-05, + "loss": 0.6156, + "step": 9895 + }, + { + "epoch": 3.0037942024586433, + "grad_norm": 0.9591747522354126, + "learning_rate": 4.995443960716817e-05, + "loss": 0.4681, + "step": 9896 + }, + { + "epoch": 3.0040977386553345, + "grad_norm": 0.973501443862915, + "learning_rate": 4.9949377341297964e-05, + "loss": 1.3864, + "step": 9897 + }, + { + "epoch": 3.004401274852026, + "grad_norm": 0.5497210621833801, + "learning_rate": 4.994431507542776e-05, + "loss": 0.7692, + "step": 9898 + }, + { + "epoch": 3.0047048110487173, + "grad_norm": 0.9023900032043457, + "learning_rate": 4.993925280955756e-05, + "loss": 0.5685, + "step": 9899 + }, + { + "epoch": 3.005008347245409, + "grad_norm": 0.945500373840332, + "learning_rate": 4.993419054368736e-05, + "loss": 0.9917, + "step": 9900 + }, + { + "epoch": 3.0053118834421007, + "grad_norm": 0.7900440692901611, + "learning_rate": 4.992912827781716e-05, + "loss": 0.7036, + "step": 9901 + }, + { + "epoch": 3.005615419638792, + "grad_norm": 0.9429001808166504, + "learning_rate": 4.992406601194695e-05, + "loss": 1.2002, + "step": 9902 + }, + { + "epoch": 3.0059189558354835, + "grad_norm": 1.0113555192947388, + "learning_rate": 4.9919003746076746e-05, + "loss": 0.7924, + "step": 9903 + }, + { + "epoch": 3.0062224920321747, + "grad_norm": 0.727717936038971, + "learning_rate": 4.991394148020654e-05, + "loss": 1.268, + "step": 9904 + }, + { + "epoch": 3.0065260282288664, + "grad_norm": 0.9647108912467957, + "learning_rate": 4.990887921433634e-05, + "loss": 1.102, + "step": 9905 + }, + { + "epoch": 3.0068295644255576, + "grad_norm": 0.8775750398635864, + "learning_rate": 4.990381694846614e-05, + "loss": 0.738, + "step": 9906 + }, + { + "epoch": 3.0071331006222493, + "grad_norm": 0.8506520390510559, + "learning_rate": 4.989875468259593e-05, + "loss": 0.8632, + "step": 9907 + }, + { + "epoch": 3.0074366368189405, + "grad_norm": 1.0059034824371338, + "learning_rate": 4.989369241672573e-05, + "loss": 1.0357, + "step": 9908 + }, + { + "epoch": 3.007740173015632, + "grad_norm": 1.262754201889038, + "learning_rate": 4.988863015085553e-05, + "loss": 0.6071, + "step": 9909 + }, + { + "epoch": 3.008043709212324, + "grad_norm": 1.321282148361206, + "learning_rate": 4.988356788498532e-05, + "loss": 0.7738, + "step": 9910 + }, + { + "epoch": 3.008347245409015, + "grad_norm": 0.8013333082199097, + "learning_rate": 4.987850561911512e-05, + "loss": 1.4922, + "step": 9911 + }, + { + "epoch": 3.0086507816057066, + "grad_norm": 0.7580252289772034, + "learning_rate": 4.987344335324491e-05, + "loss": 1.2822, + "step": 9912 + }, + { + "epoch": 3.008954317802398, + "grad_norm": 1.1161868572235107, + "learning_rate": 4.986838108737471e-05, + "loss": 0.7667, + "step": 9913 + }, + { + "epoch": 3.0092578539990895, + "grad_norm": 1.0713251829147339, + "learning_rate": 4.986331882150451e-05, + "loss": 0.7488, + "step": 9914 + }, + { + "epoch": 3.0095613901957807, + "grad_norm": 0.5945373177528381, + "learning_rate": 4.9858256555634304e-05, + "loss": 0.6705, + "step": 9915 + }, + { + "epoch": 3.0098649263924724, + "grad_norm": 0.9917808175086975, + "learning_rate": 4.98531942897641e-05, + "loss": 0.9693, + "step": 9916 + }, + { + "epoch": 3.0101684625891636, + "grad_norm": 1.123931646347046, + "learning_rate": 4.9848132023893894e-05, + "loss": 0.9279, + "step": 9917 + }, + { + "epoch": 3.0104719987858553, + "grad_norm": 1.0795220136642456, + "learning_rate": 4.984306975802369e-05, + "loss": 0.6274, + "step": 9918 + }, + { + "epoch": 3.0107755349825465, + "grad_norm": 0.6648991703987122, + "learning_rate": 4.983800749215349e-05, + "loss": 0.7407, + "step": 9919 + }, + { + "epoch": 3.011079071179238, + "grad_norm": 0.894781768321991, + "learning_rate": 4.9832945226283285e-05, + "loss": 0.9515, + "step": 9920 + }, + { + "epoch": 3.0113826073759298, + "grad_norm": 0.6843689680099487, + "learning_rate": 4.9827882960413086e-05, + "loss": 1.2523, + "step": 9921 + }, + { + "epoch": 3.011686143572621, + "grad_norm": 0.7869067192077637, + "learning_rate": 4.982282069454288e-05, + "loss": 1.1186, + "step": 9922 + }, + { + "epoch": 3.0119896797693126, + "grad_norm": 1.1819573640823364, + "learning_rate": 4.9817758428672676e-05, + "loss": 0.4672, + "step": 9923 + }, + { + "epoch": 3.012293215966004, + "grad_norm": 1.117724895477295, + "learning_rate": 4.981269616280248e-05, + "loss": 0.4872, + "step": 9924 + }, + { + "epoch": 3.0125967521626955, + "grad_norm": 0.5742544531822205, + "learning_rate": 4.980763389693227e-05, + "loss": 0.6115, + "step": 9925 + }, + { + "epoch": 3.0129002883593867, + "grad_norm": 0.9205898642539978, + "learning_rate": 4.980257163106207e-05, + "loss": 0.6983, + "step": 9926 + }, + { + "epoch": 3.0132038245560784, + "grad_norm": 0.8232927322387695, + "learning_rate": 4.979750936519186e-05, + "loss": 0.4218, + "step": 9927 + }, + { + "epoch": 3.0135073607527696, + "grad_norm": 1.0297305583953857, + "learning_rate": 4.979244709932166e-05, + "loss": 0.7704, + "step": 9928 + }, + { + "epoch": 3.0138108969494612, + "grad_norm": 1.0752744674682617, + "learning_rate": 4.978738483345146e-05, + "loss": 0.6037, + "step": 9929 + }, + { + "epoch": 3.0141144331461525, + "grad_norm": 0.9535788297653198, + "learning_rate": 4.978232256758125e-05, + "loss": 0.8848, + "step": 9930 + }, + { + "epoch": 3.014417969342844, + "grad_norm": 0.820176899433136, + "learning_rate": 4.977726030171105e-05, + "loss": 1.0916, + "step": 9931 + }, + { + "epoch": 3.0147215055395358, + "grad_norm": 0.9483010172843933, + "learning_rate": 4.977219803584084e-05, + "loss": 0.751, + "step": 9932 + }, + { + "epoch": 3.015025041736227, + "grad_norm": 1.2106448411941528, + "learning_rate": 4.976713576997064e-05, + "loss": 0.5232, + "step": 9933 + }, + { + "epoch": 3.0153285779329186, + "grad_norm": 0.8939266204833984, + "learning_rate": 4.976207350410044e-05, + "loss": 0.9032, + "step": 9934 + }, + { + "epoch": 3.01563211412961, + "grad_norm": 1.0762056112289429, + "learning_rate": 4.9757011238230234e-05, + "loss": 0.6202, + "step": 9935 + }, + { + "epoch": 3.0159356503263015, + "grad_norm": 0.9702959656715393, + "learning_rate": 4.975194897236003e-05, + "loss": 0.6372, + "step": 9936 + }, + { + "epoch": 3.0162391865229927, + "grad_norm": 1.0524059534072876, + "learning_rate": 4.9746886706489823e-05, + "loss": 0.4538, + "step": 9937 + }, + { + "epoch": 3.0165427227196844, + "grad_norm": 1.0001167058944702, + "learning_rate": 4.9741824440619625e-05, + "loss": 0.9368, + "step": 9938 + }, + { + "epoch": 3.0168462589163756, + "grad_norm": 1.1349345445632935, + "learning_rate": 4.973676217474942e-05, + "loss": 0.8937, + "step": 9939 + }, + { + "epoch": 3.0171497951130672, + "grad_norm": 0.8107739090919495, + "learning_rate": 4.9731699908879215e-05, + "loss": 0.8347, + "step": 9940 + }, + { + "epoch": 3.017453331309759, + "grad_norm": 0.9209519624710083, + "learning_rate": 4.972663764300901e-05, + "loss": 0.8891, + "step": 9941 + }, + { + "epoch": 3.01775686750645, + "grad_norm": 0.9517861604690552, + "learning_rate": 4.9721575377138804e-05, + "loss": 0.8127, + "step": 9942 + }, + { + "epoch": 3.0180604037031418, + "grad_norm": 0.9834015369415283, + "learning_rate": 4.9716513111268606e-05, + "loss": 0.6899, + "step": 9943 + }, + { + "epoch": 3.018363939899833, + "grad_norm": 0.9329797625541687, + "learning_rate": 4.971145084539841e-05, + "loss": 0.964, + "step": 9944 + }, + { + "epoch": 3.0186674760965246, + "grad_norm": 0.721088171005249, + "learning_rate": 4.97063885795282e-05, + "loss": 0.8785, + "step": 9945 + }, + { + "epoch": 3.018971012293216, + "grad_norm": 0.9499136209487915, + "learning_rate": 4.9701326313658e-05, + "loss": 0.9782, + "step": 9946 + }, + { + "epoch": 3.0192745484899075, + "grad_norm": 0.9532542824745178, + "learning_rate": 4.969626404778779e-05, + "loss": 0.8628, + "step": 9947 + }, + { + "epoch": 3.0195780846865987, + "grad_norm": 1.153826355934143, + "learning_rate": 4.969120178191759e-05, + "loss": 0.394, + "step": 9948 + }, + { + "epoch": 3.0198816208832904, + "grad_norm": 0.9489085674285889, + "learning_rate": 4.968613951604739e-05, + "loss": 1.294, + "step": 9949 + }, + { + "epoch": 3.0201851570799816, + "grad_norm": 0.9038990139961243, + "learning_rate": 4.968107725017718e-05, + "loss": 0.6538, + "step": 9950 + }, + { + "epoch": 3.0204886932766732, + "grad_norm": 1.0667967796325684, + "learning_rate": 4.967601498430698e-05, + "loss": 0.8695, + "step": 9951 + }, + { + "epoch": 3.020792229473365, + "grad_norm": 1.0511293411254883, + "learning_rate": 4.967095271843677e-05, + "loss": 0.6799, + "step": 9952 + }, + { + "epoch": 3.021095765670056, + "grad_norm": 0.9058912396430969, + "learning_rate": 4.9665890452566574e-05, + "loss": 1.3471, + "step": 9953 + }, + { + "epoch": 3.0213993018667478, + "grad_norm": 1.0901941061019897, + "learning_rate": 4.966082818669637e-05, + "loss": 0.8364, + "step": 9954 + }, + { + "epoch": 3.021702838063439, + "grad_norm": 1.0618418455123901, + "learning_rate": 4.9655765920826164e-05, + "loss": 0.8804, + "step": 9955 + }, + { + "epoch": 3.0220063742601306, + "grad_norm": 1.1724745035171509, + "learning_rate": 4.965070365495596e-05, + "loss": 0.8896, + "step": 9956 + }, + { + "epoch": 3.022309910456822, + "grad_norm": 0.906879186630249, + "learning_rate": 4.964564138908575e-05, + "loss": 0.5583, + "step": 9957 + }, + { + "epoch": 3.0226134466535135, + "grad_norm": 0.879097580909729, + "learning_rate": 4.9640579123215555e-05, + "loss": 1.0431, + "step": 9958 + }, + { + "epoch": 3.0229169828502047, + "grad_norm": 0.9819177389144897, + "learning_rate": 4.963551685734535e-05, + "loss": 1.1626, + "step": 9959 + }, + { + "epoch": 3.0232205190468964, + "grad_norm": 1.0365228652954102, + "learning_rate": 4.9630454591475144e-05, + "loss": 0.8894, + "step": 9960 + }, + { + "epoch": 3.0235240552435876, + "grad_norm": 0.7370893955230713, + "learning_rate": 4.962539232560494e-05, + "loss": 0.7873, + "step": 9961 + }, + { + "epoch": 3.0238275914402792, + "grad_norm": 0.9230274558067322, + "learning_rate": 4.962033005973474e-05, + "loss": 0.3545, + "step": 9962 + }, + { + "epoch": 3.024131127636971, + "grad_norm": 1.0262789726257324, + "learning_rate": 4.9615267793864536e-05, + "loss": 0.7352, + "step": 9963 + }, + { + "epoch": 3.024434663833662, + "grad_norm": 1.1875581741333008, + "learning_rate": 4.961020552799433e-05, + "loss": 0.2989, + "step": 9964 + }, + { + "epoch": 3.0247382000303538, + "grad_norm": 0.7860487103462219, + "learning_rate": 4.960514326212413e-05, + "loss": 0.7674, + "step": 9965 + }, + { + "epoch": 3.025041736227045, + "grad_norm": 1.1991053819656372, + "learning_rate": 4.960008099625393e-05, + "loss": 1.179, + "step": 9966 + }, + { + "epoch": 3.0253452724237366, + "grad_norm": 1.3125637769699097, + "learning_rate": 4.959501873038372e-05, + "loss": 0.5942, + "step": 9967 + }, + { + "epoch": 3.025648808620428, + "grad_norm": 0.9002748727798462, + "learning_rate": 4.958995646451352e-05, + "loss": 0.8309, + "step": 9968 + }, + { + "epoch": 3.0259523448171195, + "grad_norm": 0.7663509249687195, + "learning_rate": 4.958489419864332e-05, + "loss": 1.3243, + "step": 9969 + }, + { + "epoch": 3.0262558810138107, + "grad_norm": 0.9954874515533447, + "learning_rate": 4.957983193277311e-05, + "loss": 0.4026, + "step": 9970 + }, + { + "epoch": 3.0265594172105024, + "grad_norm": 1.1502779722213745, + "learning_rate": 4.957476966690291e-05, + "loss": 0.729, + "step": 9971 + }, + { + "epoch": 3.026862953407194, + "grad_norm": 0.9466817378997803, + "learning_rate": 4.95697074010327e-05, + "loss": 0.7324, + "step": 9972 + }, + { + "epoch": 3.0271664896038852, + "grad_norm": 0.8472762703895569, + "learning_rate": 4.9564645135162504e-05, + "loss": 1.0931, + "step": 9973 + }, + { + "epoch": 3.027470025800577, + "grad_norm": 0.7894196510314941, + "learning_rate": 4.95595828692923e-05, + "loss": 1.1567, + "step": 9974 + }, + { + "epoch": 3.027773561997268, + "grad_norm": 0.891975462436676, + "learning_rate": 4.9554520603422093e-05, + "loss": 0.7622, + "step": 9975 + }, + { + "epoch": 3.0280770981939598, + "grad_norm": 0.9115039706230164, + "learning_rate": 4.954945833755189e-05, + "loss": 0.7939, + "step": 9976 + }, + { + "epoch": 3.028380634390651, + "grad_norm": 1.35649836063385, + "learning_rate": 4.954439607168169e-05, + "loss": 0.7654, + "step": 9977 + }, + { + "epoch": 3.0286841705873426, + "grad_norm": 1.0273194313049316, + "learning_rate": 4.9539333805811485e-05, + "loss": 0.7395, + "step": 9978 + }, + { + "epoch": 3.028987706784034, + "grad_norm": 0.8362645506858826, + "learning_rate": 4.953427153994128e-05, + "loss": 0.9669, + "step": 9979 + }, + { + "epoch": 3.0292912429807255, + "grad_norm": 1.4432626962661743, + "learning_rate": 4.9529209274071074e-05, + "loss": 0.8575, + "step": 9980 + }, + { + "epoch": 3.0295947791774167, + "grad_norm": 0.9342387914657593, + "learning_rate": 4.952414700820087e-05, + "loss": 0.9885, + "step": 9981 + }, + { + "epoch": 3.0298983153741084, + "grad_norm": 1.1419669389724731, + "learning_rate": 4.951908474233067e-05, + "loss": 1.0959, + "step": 9982 + }, + { + "epoch": 3.0302018515708, + "grad_norm": 0.9934976100921631, + "learning_rate": 4.9514022476460465e-05, + "loss": 0.5081, + "step": 9983 + }, + { + "epoch": 3.0305053877674912, + "grad_norm": 1.2122128009796143, + "learning_rate": 4.950896021059026e-05, + "loss": 0.9086, + "step": 9984 + }, + { + "epoch": 3.030808923964183, + "grad_norm": 0.9121325612068176, + "learning_rate": 4.9503897944720055e-05, + "loss": 1.1324, + "step": 9985 + }, + { + "epoch": 3.031112460160874, + "grad_norm": 0.9659928679466248, + "learning_rate": 4.949883567884985e-05, + "loss": 1.1597, + "step": 9986 + }, + { + "epoch": 3.0314159963575658, + "grad_norm": 1.1454473733901978, + "learning_rate": 4.949377341297965e-05, + "loss": 0.8592, + "step": 9987 + }, + { + "epoch": 3.031719532554257, + "grad_norm": 1.1099263429641724, + "learning_rate": 4.948871114710945e-05, + "loss": 0.4484, + "step": 9988 + }, + { + "epoch": 3.0320230687509486, + "grad_norm": 0.9389984607696533, + "learning_rate": 4.948364888123925e-05, + "loss": 0.7857, + "step": 9989 + }, + { + "epoch": 3.03232660494764, + "grad_norm": 1.0837533473968506, + "learning_rate": 4.947858661536904e-05, + "loss": 0.5011, + "step": 9990 + }, + { + "epoch": 3.0326301411443315, + "grad_norm": 0.8904072642326355, + "learning_rate": 4.947352434949884e-05, + "loss": 1.2047, + "step": 9991 + }, + { + "epoch": 3.0329336773410227, + "grad_norm": 1.046789288520813, + "learning_rate": 4.946846208362864e-05, + "loss": 0.9434, + "step": 9992 + }, + { + "epoch": 3.0332372135377144, + "grad_norm": 0.8978736996650696, + "learning_rate": 4.9463399817758434e-05, + "loss": 0.3767, + "step": 9993 + }, + { + "epoch": 3.033540749734406, + "grad_norm": 1.0865288972854614, + "learning_rate": 4.945833755188823e-05, + "loss": 0.7399, + "step": 9994 + }, + { + "epoch": 3.0338442859310972, + "grad_norm": 1.0696719884872437, + "learning_rate": 4.945327528601802e-05, + "loss": 0.9226, + "step": 9995 + }, + { + "epoch": 3.034147822127789, + "grad_norm": 0.9205285310745239, + "learning_rate": 4.944821302014782e-05, + "loss": 0.7815, + "step": 9996 + }, + { + "epoch": 3.03445135832448, + "grad_norm": 1.0227653980255127, + "learning_rate": 4.944315075427762e-05, + "loss": 0.8217, + "step": 9997 + }, + { + "epoch": 3.0347548945211718, + "grad_norm": 0.8310995101928711, + "learning_rate": 4.9438088488407414e-05, + "loss": 0.6755, + "step": 9998 + }, + { + "epoch": 3.035058430717863, + "grad_norm": 1.1098823547363281, + "learning_rate": 4.943302622253721e-05, + "loss": 0.7325, + "step": 9999 + }, + { + "epoch": 3.0353619669145546, + "grad_norm": 1.0347144603729248, + "learning_rate": 4.9427963956667004e-05, + "loss": 1.0423, + "step": 10000 + }, + { + "epoch": 3.035665503111246, + "grad_norm": 0.8058704733848572, + "learning_rate": 4.9422901690796805e-05, + "loss": 0.8933, + "step": 10001 + }, + { + "epoch": 3.0359690393079375, + "grad_norm": 0.9392812848091125, + "learning_rate": 4.94178394249266e-05, + "loss": 0.7874, + "step": 10002 + }, + { + "epoch": 3.036272575504629, + "grad_norm": 0.8939214944839478, + "learning_rate": 4.9412777159056395e-05, + "loss": 1.3727, + "step": 10003 + }, + { + "epoch": 3.0365761117013204, + "grad_norm": 1.172133207321167, + "learning_rate": 4.940771489318619e-05, + "loss": 0.8838, + "step": 10004 + }, + { + "epoch": 3.036879647898012, + "grad_norm": 0.9525058269500732, + "learning_rate": 4.9402652627315985e-05, + "loss": 1.0387, + "step": 10005 + }, + { + "epoch": 3.0371831840947032, + "grad_norm": 0.9174591302871704, + "learning_rate": 4.9397590361445786e-05, + "loss": 0.9717, + "step": 10006 + }, + { + "epoch": 3.037486720291395, + "grad_norm": 1.0333763360977173, + "learning_rate": 4.939252809557558e-05, + "loss": 1.1213, + "step": 10007 + }, + { + "epoch": 3.037790256488086, + "grad_norm": 0.9586665034294128, + "learning_rate": 4.9387465829705376e-05, + "loss": 0.8477, + "step": 10008 + }, + { + "epoch": 3.0380937926847777, + "grad_norm": 1.0275641679763794, + "learning_rate": 4.938240356383517e-05, + "loss": 0.9455, + "step": 10009 + }, + { + "epoch": 3.038397328881469, + "grad_norm": 1.0820685625076294, + "learning_rate": 4.937734129796497e-05, + "loss": 0.9489, + "step": 10010 + }, + { + "epoch": 3.0387008650781606, + "grad_norm": 0.7282118797302246, + "learning_rate": 4.937227903209477e-05, + "loss": 1.0303, + "step": 10011 + }, + { + "epoch": 3.039004401274852, + "grad_norm": 0.8379800319671631, + "learning_rate": 4.936721676622457e-05, + "loss": 0.3675, + "step": 10012 + }, + { + "epoch": 3.0393079374715435, + "grad_norm": 0.8979037404060364, + "learning_rate": 4.936215450035436e-05, + "loss": 0.5392, + "step": 10013 + }, + { + "epoch": 3.039611473668235, + "grad_norm": 1.0318119525909424, + "learning_rate": 4.935709223448416e-05, + "loss": 1.1578, + "step": 10014 + }, + { + "epoch": 3.0399150098649264, + "grad_norm": 0.9774758815765381, + "learning_rate": 4.935202996861395e-05, + "loss": 1.1318, + "step": 10015 + }, + { + "epoch": 3.040218546061618, + "grad_norm": 0.9226740002632141, + "learning_rate": 4.9346967702743755e-05, + "loss": 0.9116, + "step": 10016 + }, + { + "epoch": 3.0405220822583092, + "grad_norm": 0.956209659576416, + "learning_rate": 4.934190543687355e-05, + "loss": 0.728, + "step": 10017 + }, + { + "epoch": 3.040825618455001, + "grad_norm": 0.6110402941703796, + "learning_rate": 4.9336843171003344e-05, + "loss": 0.7283, + "step": 10018 + }, + { + "epoch": 3.041129154651692, + "grad_norm": 0.9066857099533081, + "learning_rate": 4.933178090513314e-05, + "loss": 0.4915, + "step": 10019 + }, + { + "epoch": 3.0414326908483837, + "grad_norm": 1.0756696462631226, + "learning_rate": 4.9326718639262934e-05, + "loss": 0.7464, + "step": 10020 + }, + { + "epoch": 3.041736227045075, + "grad_norm": 0.8810809254646301, + "learning_rate": 4.9321656373392735e-05, + "loss": 0.7272, + "step": 10021 + }, + { + "epoch": 3.0420397632417666, + "grad_norm": 1.0554684400558472, + "learning_rate": 4.931659410752253e-05, + "loss": 1.1886, + "step": 10022 + }, + { + "epoch": 3.042343299438458, + "grad_norm": 1.0114916563034058, + "learning_rate": 4.9311531841652325e-05, + "loss": 1.1831, + "step": 10023 + }, + { + "epoch": 3.0426468356351495, + "grad_norm": 0.9716253876686096, + "learning_rate": 4.930646957578212e-05, + "loss": 1.009, + "step": 10024 + }, + { + "epoch": 3.042950371831841, + "grad_norm": 1.1282830238342285, + "learning_rate": 4.9301407309911914e-05, + "loss": 0.5271, + "step": 10025 + }, + { + "epoch": 3.0432539080285324, + "grad_norm": 1.0426957607269287, + "learning_rate": 4.9296345044041716e-05, + "loss": 0.4032, + "step": 10026 + }, + { + "epoch": 3.043557444225224, + "grad_norm": 1.0833714008331299, + "learning_rate": 4.929128277817151e-05, + "loss": 1.1721, + "step": 10027 + }, + { + "epoch": 3.043860980421915, + "grad_norm": 0.9141514897346497, + "learning_rate": 4.9286220512301306e-05, + "loss": 0.6118, + "step": 10028 + }, + { + "epoch": 3.044164516618607, + "grad_norm": 0.9531258940696716, + "learning_rate": 4.92811582464311e-05, + "loss": 0.8177, + "step": 10029 + }, + { + "epoch": 3.044468052815298, + "grad_norm": 0.6411280632019043, + "learning_rate": 4.92760959805609e-05, + "loss": 0.6036, + "step": 10030 + }, + { + "epoch": 3.0447715890119897, + "grad_norm": 0.9066987037658691, + "learning_rate": 4.92710337146907e-05, + "loss": 0.8623, + "step": 10031 + }, + { + "epoch": 3.045075125208681, + "grad_norm": 1.0410274267196655, + "learning_rate": 4.92659714488205e-05, + "loss": 0.9437, + "step": 10032 + }, + { + "epoch": 3.0453786614053726, + "grad_norm": 0.8047177791595459, + "learning_rate": 4.926090918295029e-05, + "loss": 1.1362, + "step": 10033 + }, + { + "epoch": 3.045682197602064, + "grad_norm": 0.870140552520752, + "learning_rate": 4.925584691708009e-05, + "loss": 0.8955, + "step": 10034 + }, + { + "epoch": 3.0459857337987555, + "grad_norm": 0.9207876920700073, + "learning_rate": 4.925078465120988e-05, + "loss": 0.2538, + "step": 10035 + }, + { + "epoch": 3.046289269995447, + "grad_norm": 0.7734060883522034, + "learning_rate": 4.9245722385339684e-05, + "loss": 0.9932, + "step": 10036 + }, + { + "epoch": 3.0465928061921383, + "grad_norm": 0.9379348158836365, + "learning_rate": 4.924066011946948e-05, + "loss": 0.7212, + "step": 10037 + }, + { + "epoch": 3.04689634238883, + "grad_norm": 1.1071947813034058, + "learning_rate": 4.9235597853599274e-05, + "loss": 0.7786, + "step": 10038 + }, + { + "epoch": 3.047199878585521, + "grad_norm": 0.9315916299819946, + "learning_rate": 4.923053558772907e-05, + "loss": 0.8557, + "step": 10039 + }, + { + "epoch": 3.047503414782213, + "grad_norm": 0.983763575553894, + "learning_rate": 4.922547332185887e-05, + "loss": 1.2008, + "step": 10040 + }, + { + "epoch": 3.047806950978904, + "grad_norm": 0.7589134573936462, + "learning_rate": 4.9220411055988665e-05, + "loss": 0.5416, + "step": 10041 + }, + { + "epoch": 3.0481104871755957, + "grad_norm": 0.9792746901512146, + "learning_rate": 4.921534879011846e-05, + "loss": 0.8329, + "step": 10042 + }, + { + "epoch": 3.048414023372287, + "grad_norm": 0.6817960143089294, + "learning_rate": 4.9210286524248255e-05, + "loss": 0.8159, + "step": 10043 + }, + { + "epoch": 3.0487175595689786, + "grad_norm": 0.799160897731781, + "learning_rate": 4.920522425837805e-05, + "loss": 1.1025, + "step": 10044 + }, + { + "epoch": 3.0490210957656703, + "grad_norm": 0.9413865804672241, + "learning_rate": 4.920016199250785e-05, + "loss": 0.7961, + "step": 10045 + }, + { + "epoch": 3.0493246319623615, + "grad_norm": 0.7982587814331055, + "learning_rate": 4.9195099726637646e-05, + "loss": 1.3936, + "step": 10046 + }, + { + "epoch": 3.049628168159053, + "grad_norm": 1.055144190788269, + "learning_rate": 4.919003746076744e-05, + "loss": 0.4997, + "step": 10047 + }, + { + "epoch": 3.0499317043557443, + "grad_norm": 2.2863147258758545, + "learning_rate": 4.9184975194897235e-05, + "loss": 1.7582, + "step": 10048 + }, + { + "epoch": 3.050235240552436, + "grad_norm": 0.8687443733215332, + "learning_rate": 4.917991292902703e-05, + "loss": 0.9172, + "step": 10049 + }, + { + "epoch": 3.050538776749127, + "grad_norm": 1.1977264881134033, + "learning_rate": 4.917485066315683e-05, + "loss": 0.8445, + "step": 10050 + }, + { + "epoch": 3.050842312945819, + "grad_norm": 0.8052032589912415, + "learning_rate": 4.9169788397286627e-05, + "loss": 0.5158, + "step": 10051 + }, + { + "epoch": 3.05114584914251, + "grad_norm": 1.1818726062774658, + "learning_rate": 4.916472613141642e-05, + "loss": 0.9488, + "step": 10052 + }, + { + "epoch": 3.0514493853392017, + "grad_norm": 0.9007443785667419, + "learning_rate": 4.9159663865546216e-05, + "loss": 1.0131, + "step": 10053 + }, + { + "epoch": 3.051752921535893, + "grad_norm": 0.8898486495018005, + "learning_rate": 4.915460159967602e-05, + "loss": 0.9711, + "step": 10054 + }, + { + "epoch": 3.0520564577325846, + "grad_norm": 0.9535994529724121, + "learning_rate": 4.914953933380582e-05, + "loss": 0.5823, + "step": 10055 + }, + { + "epoch": 3.0523599939292763, + "grad_norm": 1.3218342065811157, + "learning_rate": 4.9144477067935614e-05, + "loss": 0.8556, + "step": 10056 + }, + { + "epoch": 3.0526635301259675, + "grad_norm": 0.7276322245597839, + "learning_rate": 4.913941480206541e-05, + "loss": 0.7105, + "step": 10057 + }, + { + "epoch": 3.052967066322659, + "grad_norm": 0.8894461989402771, + "learning_rate": 4.9134352536195204e-05, + "loss": 0.9246, + "step": 10058 + }, + { + "epoch": 3.0532706025193503, + "grad_norm": 0.9899723529815674, + "learning_rate": 4.9129290270325e-05, + "loss": 0.8808, + "step": 10059 + }, + { + "epoch": 3.053574138716042, + "grad_norm": 1.315685749053955, + "learning_rate": 4.91242280044548e-05, + "loss": 0.7734, + "step": 10060 + }, + { + "epoch": 3.053877674912733, + "grad_norm": 0.9604356288909912, + "learning_rate": 4.9119165738584595e-05, + "loss": 0.7402, + "step": 10061 + }, + { + "epoch": 3.054181211109425, + "grad_norm": 1.1020888090133667, + "learning_rate": 4.911410347271439e-05, + "loss": 1.0025, + "step": 10062 + }, + { + "epoch": 3.054484747306116, + "grad_norm": 1.0519235134124756, + "learning_rate": 4.9109041206844184e-05, + "loss": 0.6515, + "step": 10063 + }, + { + "epoch": 3.0547882835028077, + "grad_norm": 1.1188275814056396, + "learning_rate": 4.910397894097398e-05, + "loss": 1.041, + "step": 10064 + }, + { + "epoch": 3.0550918196994994, + "grad_norm": 1.0462764501571655, + "learning_rate": 4.909891667510378e-05, + "loss": 0.9353, + "step": 10065 + }, + { + "epoch": 3.0553953558961906, + "grad_norm": 1.1422230005264282, + "learning_rate": 4.9093854409233576e-05, + "loss": 0.6631, + "step": 10066 + }, + { + "epoch": 3.0556988920928823, + "grad_norm": 0.9186900854110718, + "learning_rate": 4.908879214336337e-05, + "loss": 1.4445, + "step": 10067 + }, + { + "epoch": 3.0560024282895735, + "grad_norm": 0.961794912815094, + "learning_rate": 4.9083729877493165e-05, + "loss": 0.7322, + "step": 10068 + }, + { + "epoch": 3.056305964486265, + "grad_norm": 0.7267061471939087, + "learning_rate": 4.907866761162297e-05, + "loss": 1.0208, + "step": 10069 + }, + { + "epoch": 3.0566095006829563, + "grad_norm": 0.7841180562973022, + "learning_rate": 4.907360534575276e-05, + "loss": 1.2743, + "step": 10070 + }, + { + "epoch": 3.056913036879648, + "grad_norm": 1.1358261108398438, + "learning_rate": 4.9068543079882556e-05, + "loss": 0.7434, + "step": 10071 + }, + { + "epoch": 3.057216573076339, + "grad_norm": 1.2675440311431885, + "learning_rate": 4.906348081401235e-05, + "loss": 0.8334, + "step": 10072 + }, + { + "epoch": 3.057520109273031, + "grad_norm": 0.7764325737953186, + "learning_rate": 4.9058418548142146e-05, + "loss": 0.9979, + "step": 10073 + }, + { + "epoch": 3.057823645469722, + "grad_norm": 0.8188411593437195, + "learning_rate": 4.905335628227195e-05, + "loss": 1.3156, + "step": 10074 + }, + { + "epoch": 3.0581271816664137, + "grad_norm": 1.1691958904266357, + "learning_rate": 4.904829401640174e-05, + "loss": 0.7075, + "step": 10075 + }, + { + "epoch": 3.0584307178631054, + "grad_norm": 1.2156111001968384, + "learning_rate": 4.904323175053154e-05, + "loss": 0.7834, + "step": 10076 + }, + { + "epoch": 3.0587342540597966, + "grad_norm": 0.6765130162239075, + "learning_rate": 4.903816948466134e-05, + "loss": 0.6806, + "step": 10077 + }, + { + "epoch": 3.0590377902564883, + "grad_norm": 1.0758798122406006, + "learning_rate": 4.9033107218791133e-05, + "loss": 0.9363, + "step": 10078 + }, + { + "epoch": 3.0593413264531795, + "grad_norm": 0.8120879530906677, + "learning_rate": 4.9028044952920935e-05, + "loss": 0.5809, + "step": 10079 + }, + { + "epoch": 3.059644862649871, + "grad_norm": 0.819503664970398, + "learning_rate": 4.902298268705073e-05, + "loss": 1.7204, + "step": 10080 + }, + { + "epoch": 3.0599483988465623, + "grad_norm": 1.0758404731750488, + "learning_rate": 4.9017920421180525e-05, + "loss": 1.0819, + "step": 10081 + }, + { + "epoch": 3.060251935043254, + "grad_norm": 1.1419119834899902, + "learning_rate": 4.901285815531032e-05, + "loss": 0.8049, + "step": 10082 + }, + { + "epoch": 3.060555471239945, + "grad_norm": 1.0506309270858765, + "learning_rate": 4.9007795889440114e-05, + "loss": 1.0661, + "step": 10083 + }, + { + "epoch": 3.060859007436637, + "grad_norm": 0.8997217416763306, + "learning_rate": 4.9002733623569916e-05, + "loss": 1.2288, + "step": 10084 + }, + { + "epoch": 3.061162543633328, + "grad_norm": 1.093537449836731, + "learning_rate": 4.899767135769971e-05, + "loss": 1.11, + "step": 10085 + }, + { + "epoch": 3.0614660798300197, + "grad_norm": 0.8891075849533081, + "learning_rate": 4.8992609091829505e-05, + "loss": 1.1641, + "step": 10086 + }, + { + "epoch": 3.0617696160267114, + "grad_norm": 1.0170884132385254, + "learning_rate": 4.89875468259593e-05, + "loss": 0.7591, + "step": 10087 + }, + { + "epoch": 3.0620731522234026, + "grad_norm": 0.8580564856529236, + "learning_rate": 4.8982484560089095e-05, + "loss": 0.944, + "step": 10088 + }, + { + "epoch": 3.0623766884200943, + "grad_norm": 1.0548725128173828, + "learning_rate": 4.8977422294218896e-05, + "loss": 0.7254, + "step": 10089 + }, + { + "epoch": 3.0626802246167855, + "grad_norm": 1.0889960527420044, + "learning_rate": 4.897236002834869e-05, + "loss": 1.0936, + "step": 10090 + }, + { + "epoch": 3.062983760813477, + "grad_norm": 0.8464717864990234, + "learning_rate": 4.8967297762478486e-05, + "loss": 1.1986, + "step": 10091 + }, + { + "epoch": 3.0632872970101683, + "grad_norm": 1.111967921257019, + "learning_rate": 4.896223549660828e-05, + "loss": 0.5752, + "step": 10092 + }, + { + "epoch": 3.06359083320686, + "grad_norm": 1.176175832748413, + "learning_rate": 4.895717323073808e-05, + "loss": 0.8448, + "step": 10093 + }, + { + "epoch": 3.063894369403551, + "grad_norm": 1.0573111772537231, + "learning_rate": 4.895211096486788e-05, + "loss": 0.9868, + "step": 10094 + }, + { + "epoch": 3.064197905600243, + "grad_norm": 1.0002180337905884, + "learning_rate": 4.894704869899767e-05, + "loss": 1.2311, + "step": 10095 + }, + { + "epoch": 3.064501441796934, + "grad_norm": 0.8811676502227783, + "learning_rate": 4.894198643312747e-05, + "loss": 0.8717, + "step": 10096 + }, + { + "epoch": 3.0648049779936257, + "grad_norm": 0.8967773914337158, + "learning_rate": 4.893692416725726e-05, + "loss": 1.1097, + "step": 10097 + }, + { + "epoch": 3.0651085141903174, + "grad_norm": 1.1144475936889648, + "learning_rate": 4.893186190138706e-05, + "loss": 1.0551, + "step": 10098 + }, + { + "epoch": 3.0654120503870086, + "grad_norm": 0.8101746439933777, + "learning_rate": 4.8926799635516865e-05, + "loss": 0.7119, + "step": 10099 + }, + { + "epoch": 3.0657155865837002, + "grad_norm": 1.0638407468795776, + "learning_rate": 4.892173736964666e-05, + "loss": 0.6897, + "step": 10100 + }, + { + "epoch": 3.0660191227803915, + "grad_norm": 0.5819783210754395, + "learning_rate": 4.8916675103776454e-05, + "loss": 0.6872, + "step": 10101 + }, + { + "epoch": 3.066322658977083, + "grad_norm": 0.7732778191566467, + "learning_rate": 4.891161283790625e-05, + "loss": 1.4585, + "step": 10102 + }, + { + "epoch": 3.0666261951737743, + "grad_norm": 1.2580299377441406, + "learning_rate": 4.8906550572036044e-05, + "loss": 1.0117, + "step": 10103 + }, + { + "epoch": 3.066929731370466, + "grad_norm": 0.9836311340332031, + "learning_rate": 4.8901488306165846e-05, + "loss": 1.2415, + "step": 10104 + }, + { + "epoch": 3.067233267567157, + "grad_norm": 1.0615112781524658, + "learning_rate": 4.889642604029564e-05, + "loss": 1.2559, + "step": 10105 + }, + { + "epoch": 3.067536803763849, + "grad_norm": 0.9382474422454834, + "learning_rate": 4.8891363774425435e-05, + "loss": 0.8988, + "step": 10106 + }, + { + "epoch": 3.0678403399605405, + "grad_norm": 0.8555431365966797, + "learning_rate": 4.888630150855523e-05, + "loss": 1.0088, + "step": 10107 + }, + { + "epoch": 3.0681438761572317, + "grad_norm": 0.7999470233917236, + "learning_rate": 4.888123924268503e-05, + "loss": 1.3693, + "step": 10108 + }, + { + "epoch": 3.0684474123539234, + "grad_norm": 1.0631412267684937, + "learning_rate": 4.8876176976814826e-05, + "loss": 0.6953, + "step": 10109 + }, + { + "epoch": 3.0687509485506146, + "grad_norm": 0.8676734566688538, + "learning_rate": 4.887111471094462e-05, + "loss": 1.0611, + "step": 10110 + }, + { + "epoch": 3.0690544847473062, + "grad_norm": 1.0192352533340454, + "learning_rate": 4.8866052445074416e-05, + "loss": 0.55, + "step": 10111 + }, + { + "epoch": 3.0693580209439975, + "grad_norm": 1.1791293621063232, + "learning_rate": 4.886099017920421e-05, + "loss": 1.0875, + "step": 10112 + }, + { + "epoch": 3.069661557140689, + "grad_norm": 1.0366480350494385, + "learning_rate": 4.885592791333401e-05, + "loss": 1.1656, + "step": 10113 + }, + { + "epoch": 3.0699650933373803, + "grad_norm": 0.650903582572937, + "learning_rate": 4.885086564746381e-05, + "loss": 0.592, + "step": 10114 + }, + { + "epoch": 3.070268629534072, + "grad_norm": 1.1978451013565063, + "learning_rate": 4.88458033815936e-05, + "loss": 1.031, + "step": 10115 + }, + { + "epoch": 3.070572165730763, + "grad_norm": 1.03373384475708, + "learning_rate": 4.8840741115723397e-05, + "loss": 0.561, + "step": 10116 + }, + { + "epoch": 3.070875701927455, + "grad_norm": 1.226913571357727, + "learning_rate": 4.883567884985319e-05, + "loss": 0.8524, + "step": 10117 + }, + { + "epoch": 3.0711792381241465, + "grad_norm": 0.4368791878223419, + "learning_rate": 4.883061658398299e-05, + "loss": 0.8886, + "step": 10118 + }, + { + "epoch": 3.0714827743208377, + "grad_norm": 0.9825581908226013, + "learning_rate": 4.882555431811279e-05, + "loss": 0.8351, + "step": 10119 + }, + { + "epoch": 3.0717863105175294, + "grad_norm": 0.9371309876441956, + "learning_rate": 4.882049205224258e-05, + "loss": 0.8427, + "step": 10120 + }, + { + "epoch": 3.0720898467142206, + "grad_norm": 0.9688662886619568, + "learning_rate": 4.8815429786372384e-05, + "loss": 0.8709, + "step": 10121 + }, + { + "epoch": 3.0723933829109122, + "grad_norm": 1.0375014543533325, + "learning_rate": 4.881036752050218e-05, + "loss": 1.0278, + "step": 10122 + }, + { + "epoch": 3.0726969191076035, + "grad_norm": 0.9171379804611206, + "learning_rate": 4.880530525463198e-05, + "loss": 0.834, + "step": 10123 + }, + { + "epoch": 3.073000455304295, + "grad_norm": 0.9114576578140259, + "learning_rate": 4.8800242988761775e-05, + "loss": 1.2445, + "step": 10124 + }, + { + "epoch": 3.0733039915009863, + "grad_norm": 1.1542693376541138, + "learning_rate": 4.879518072289157e-05, + "loss": 1.0519, + "step": 10125 + }, + { + "epoch": 3.073607527697678, + "grad_norm": 0.939974308013916, + "learning_rate": 4.8790118457021365e-05, + "loss": 1.3, + "step": 10126 + }, + { + "epoch": 3.073911063894369, + "grad_norm": 0.9361798167228699, + "learning_rate": 4.878505619115116e-05, + "loss": 0.7195, + "step": 10127 + }, + { + "epoch": 3.074214600091061, + "grad_norm": 1.0085220336914062, + "learning_rate": 4.877999392528096e-05, + "loss": 0.8656, + "step": 10128 + }, + { + "epoch": 3.0745181362877525, + "grad_norm": 1.0506658554077148, + "learning_rate": 4.8774931659410756e-05, + "loss": 1.0325, + "step": 10129 + }, + { + "epoch": 3.0748216724844437, + "grad_norm": 0.9073367714881897, + "learning_rate": 4.876986939354055e-05, + "loss": 1.2902, + "step": 10130 + }, + { + "epoch": 3.0751252086811354, + "grad_norm": 1.016314148902893, + "learning_rate": 4.8764807127670346e-05, + "loss": 0.8455, + "step": 10131 + }, + { + "epoch": 3.0754287448778266, + "grad_norm": 0.9842737317085266, + "learning_rate": 4.875974486180015e-05, + "loss": 1.0372, + "step": 10132 + }, + { + "epoch": 3.0757322810745182, + "grad_norm": 0.8412840962409973, + "learning_rate": 4.875468259592994e-05, + "loss": 0.6934, + "step": 10133 + }, + { + "epoch": 3.0760358172712094, + "grad_norm": 0.969932496547699, + "learning_rate": 4.874962033005974e-05, + "loss": 0.7008, + "step": 10134 + }, + { + "epoch": 3.076339353467901, + "grad_norm": 1.0085067749023438, + "learning_rate": 4.874455806418953e-05, + "loss": 0.3996, + "step": 10135 + }, + { + "epoch": 3.0766428896645923, + "grad_norm": 0.949174165725708, + "learning_rate": 4.8739495798319326e-05, + "loss": 0.7778, + "step": 10136 + }, + { + "epoch": 3.076946425861284, + "grad_norm": 1.015836238861084, + "learning_rate": 4.873443353244913e-05, + "loss": 0.3643, + "step": 10137 + }, + { + "epoch": 3.0772499620579756, + "grad_norm": 1.2225645780563354, + "learning_rate": 4.872937126657892e-05, + "loss": 0.6477, + "step": 10138 + }, + { + "epoch": 3.077553498254667, + "grad_norm": 0.7706165909767151, + "learning_rate": 4.872430900070872e-05, + "loss": 0.7601, + "step": 10139 + }, + { + "epoch": 3.0778570344513585, + "grad_norm": 0.8568792343139648, + "learning_rate": 4.871924673483851e-05, + "loss": 1.1927, + "step": 10140 + }, + { + "epoch": 3.0781605706480497, + "grad_norm": 1.1199562549591064, + "learning_rate": 4.871418446896831e-05, + "loss": 1.1007, + "step": 10141 + }, + { + "epoch": 3.0784641068447414, + "grad_norm": 0.8747684955596924, + "learning_rate": 4.870912220309811e-05, + "loss": 0.7614, + "step": 10142 + }, + { + "epoch": 3.0787676430414326, + "grad_norm": 1.061916708946228, + "learning_rate": 4.870405993722791e-05, + "loss": 0.3167, + "step": 10143 + }, + { + "epoch": 3.0790711792381242, + "grad_norm": 1.4072226285934448, + "learning_rate": 4.8698997671357705e-05, + "loss": 0.9254, + "step": 10144 + }, + { + "epoch": 3.0793747154348154, + "grad_norm": 0.8890615701675415, + "learning_rate": 4.86939354054875e-05, + "loss": 1.3558, + "step": 10145 + }, + { + "epoch": 3.079678251631507, + "grad_norm": 1.1148350238800049, + "learning_rate": 4.8688873139617295e-05, + "loss": 0.4573, + "step": 10146 + }, + { + "epoch": 3.0799817878281983, + "grad_norm": 0.5135939717292786, + "learning_rate": 4.8683810873747096e-05, + "loss": 0.9231, + "step": 10147 + }, + { + "epoch": 3.08028532402489, + "grad_norm": 1.1517614126205444, + "learning_rate": 4.867874860787689e-05, + "loss": 0.663, + "step": 10148 + }, + { + "epoch": 3.0805888602215816, + "grad_norm": 0.9306285381317139, + "learning_rate": 4.8673686342006686e-05, + "loss": 0.7039, + "step": 10149 + }, + { + "epoch": 3.080892396418273, + "grad_norm": 0.9176190495491028, + "learning_rate": 4.866862407613648e-05, + "loss": 1.1242, + "step": 10150 + }, + { + "epoch": 3.0811959326149645, + "grad_norm": 0.9981781840324402, + "learning_rate": 4.8663561810266275e-05, + "loss": 0.5939, + "step": 10151 + }, + { + "epoch": 3.0814994688116557, + "grad_norm": 0.8128316402435303, + "learning_rate": 4.865849954439608e-05, + "loss": 0.4114, + "step": 10152 + }, + { + "epoch": 3.0818030050083474, + "grad_norm": 1.1425631046295166, + "learning_rate": 4.865343727852587e-05, + "loss": 0.6942, + "step": 10153 + }, + { + "epoch": 3.0821065412050386, + "grad_norm": 0.9615387916564941, + "learning_rate": 4.8648375012655667e-05, + "loss": 1.0414, + "step": 10154 + }, + { + "epoch": 3.0824100774017302, + "grad_norm": 1.1574647426605225, + "learning_rate": 4.864331274678546e-05, + "loss": 1.1407, + "step": 10155 + }, + { + "epoch": 3.0827136135984214, + "grad_norm": 1.0242623090744019, + "learning_rate": 4.8638250480915256e-05, + "loss": 0.9817, + "step": 10156 + }, + { + "epoch": 3.083017149795113, + "grad_norm": 0.9051864147186279, + "learning_rate": 4.863318821504506e-05, + "loss": 0.6063, + "step": 10157 + }, + { + "epoch": 3.0833206859918043, + "grad_norm": 0.9410679340362549, + "learning_rate": 4.862812594917485e-05, + "loss": 1.2234, + "step": 10158 + }, + { + "epoch": 3.083624222188496, + "grad_norm": 1.2388213872909546, + "learning_rate": 4.862306368330465e-05, + "loss": 0.6131, + "step": 10159 + }, + { + "epoch": 3.0839277583851876, + "grad_norm": 0.7836751341819763, + "learning_rate": 4.861800141743444e-05, + "loss": 0.7826, + "step": 10160 + }, + { + "epoch": 3.084231294581879, + "grad_norm": 1.0143697261810303, + "learning_rate": 4.8612939151564244e-05, + "loss": 0.7356, + "step": 10161 + }, + { + "epoch": 3.0845348307785705, + "grad_norm": 1.0377453565597534, + "learning_rate": 4.860787688569404e-05, + "loss": 1.1306, + "step": 10162 + }, + { + "epoch": 3.0848383669752617, + "grad_norm": 0.7361028790473938, + "learning_rate": 4.860281461982383e-05, + "loss": 1.3143, + "step": 10163 + }, + { + "epoch": 3.0851419031719534, + "grad_norm": 0.9727846384048462, + "learning_rate": 4.859775235395363e-05, + "loss": 1.0731, + "step": 10164 + }, + { + "epoch": 3.0854454393686446, + "grad_norm": 0.9929563403129578, + "learning_rate": 4.859269008808342e-05, + "loss": 0.8171, + "step": 10165 + }, + { + "epoch": 3.0857489755653362, + "grad_norm": 0.7381608486175537, + "learning_rate": 4.8587627822213224e-05, + "loss": 0.5143, + "step": 10166 + }, + { + "epoch": 3.0860525117620274, + "grad_norm": 0.9809612035751343, + "learning_rate": 4.8582565556343026e-05, + "loss": 1.1056, + "step": 10167 + }, + { + "epoch": 3.086356047958719, + "grad_norm": 1.061365008354187, + "learning_rate": 4.857750329047282e-05, + "loss": 0.7402, + "step": 10168 + }, + { + "epoch": 3.0866595841554103, + "grad_norm": 1.2772657871246338, + "learning_rate": 4.8572441024602616e-05, + "loss": 0.6158, + "step": 10169 + }, + { + "epoch": 3.086963120352102, + "grad_norm": 0.9810596108436584, + "learning_rate": 4.856737875873241e-05, + "loss": 0.8506, + "step": 10170 + }, + { + "epoch": 3.0872666565487936, + "grad_norm": 0.9208558201789856, + "learning_rate": 4.856231649286221e-05, + "loss": 0.8011, + "step": 10171 + }, + { + "epoch": 3.087570192745485, + "grad_norm": 0.7188460826873779, + "learning_rate": 4.855725422699201e-05, + "loss": 0.9826, + "step": 10172 + }, + { + "epoch": 3.0878737289421765, + "grad_norm": 1.0052639245986938, + "learning_rate": 4.85521919611218e-05, + "loss": 0.5962, + "step": 10173 + }, + { + "epoch": 3.0881772651388677, + "grad_norm": 1.3320200443267822, + "learning_rate": 4.8547129695251596e-05, + "loss": 1.0021, + "step": 10174 + }, + { + "epoch": 3.0884808013355594, + "grad_norm": 0.957373321056366, + "learning_rate": 4.854206742938139e-05, + "loss": 0.7447, + "step": 10175 + }, + { + "epoch": 3.0887843375322506, + "grad_norm": 0.9657913446426392, + "learning_rate": 4.853700516351119e-05, + "loss": 1.1573, + "step": 10176 + }, + { + "epoch": 3.0890878737289422, + "grad_norm": 1.0734162330627441, + "learning_rate": 4.853194289764099e-05, + "loss": 1.1698, + "step": 10177 + }, + { + "epoch": 3.0893914099256334, + "grad_norm": 1.0274368524551392, + "learning_rate": 4.852688063177078e-05, + "loss": 1.101, + "step": 10178 + }, + { + "epoch": 3.089694946122325, + "grad_norm": 0.9829404950141907, + "learning_rate": 4.852181836590058e-05, + "loss": 1.0085, + "step": 10179 + }, + { + "epoch": 3.0899984823190167, + "grad_norm": 0.8473483920097351, + "learning_rate": 4.851675610003037e-05, + "loss": 0.7356, + "step": 10180 + }, + { + "epoch": 3.090302018515708, + "grad_norm": 0.9783633351325989, + "learning_rate": 4.8511693834160173e-05, + "loss": 1.4656, + "step": 10181 + }, + { + "epoch": 3.0906055547123996, + "grad_norm": 1.140689492225647, + "learning_rate": 4.850663156828997e-05, + "loss": 0.8251, + "step": 10182 + }, + { + "epoch": 3.090909090909091, + "grad_norm": 0.982058048248291, + "learning_rate": 4.850156930241976e-05, + "loss": 1.09, + "step": 10183 + }, + { + "epoch": 3.0912126271057825, + "grad_norm": 1.01748526096344, + "learning_rate": 4.849650703654956e-05, + "loss": 0.3319, + "step": 10184 + }, + { + "epoch": 3.0915161633024737, + "grad_norm": 0.8173680305480957, + "learning_rate": 4.849144477067936e-05, + "loss": 1.2472, + "step": 10185 + }, + { + "epoch": 3.0918196994991654, + "grad_norm": 1.1161898374557495, + "learning_rate": 4.8486382504809154e-05, + "loss": 1.0186, + "step": 10186 + }, + { + "epoch": 3.0921232356958566, + "grad_norm": 0.839145302772522, + "learning_rate": 4.848132023893895e-05, + "loss": 1.2504, + "step": 10187 + }, + { + "epoch": 3.092426771892548, + "grad_norm": 1.184670329093933, + "learning_rate": 4.847625797306875e-05, + "loss": 0.5871, + "step": 10188 + }, + { + "epoch": 3.0927303080892394, + "grad_norm": 0.9579082727432251, + "learning_rate": 4.8471195707198545e-05, + "loss": 0.5709, + "step": 10189 + }, + { + "epoch": 3.093033844285931, + "grad_norm": 1.0717566013336182, + "learning_rate": 4.846613344132834e-05, + "loss": 0.7079, + "step": 10190 + }, + { + "epoch": 3.0933373804826227, + "grad_norm": 1.128456473350525, + "learning_rate": 4.846107117545814e-05, + "loss": 0.9407, + "step": 10191 + }, + { + "epoch": 3.093640916679314, + "grad_norm": 1.162298560142517, + "learning_rate": 4.8456008909587937e-05, + "loss": 1.0897, + "step": 10192 + }, + { + "epoch": 3.0939444528760056, + "grad_norm": 0.9336274862289429, + "learning_rate": 4.845094664371773e-05, + "loss": 0.9561, + "step": 10193 + }, + { + "epoch": 3.094247989072697, + "grad_norm": 1.1214889287948608, + "learning_rate": 4.8445884377847526e-05, + "loss": 1.0063, + "step": 10194 + }, + { + "epoch": 3.0945515252693885, + "grad_norm": 1.038883924484253, + "learning_rate": 4.844082211197732e-05, + "loss": 1.3001, + "step": 10195 + }, + { + "epoch": 3.0948550614660797, + "grad_norm": 1.0163494348526, + "learning_rate": 4.843575984610712e-05, + "loss": 1.3321, + "step": 10196 + }, + { + "epoch": 3.0951585976627713, + "grad_norm": 1.0134986639022827, + "learning_rate": 4.843069758023692e-05, + "loss": 0.9723, + "step": 10197 + }, + { + "epoch": 3.0954621338594626, + "grad_norm": 0.8482567071914673, + "learning_rate": 4.842563531436671e-05, + "loss": 0.9114, + "step": 10198 + }, + { + "epoch": 3.095765670056154, + "grad_norm": 0.8050134778022766, + "learning_rate": 4.842057304849651e-05, + "loss": 1.0083, + "step": 10199 + }, + { + "epoch": 3.096069206252846, + "grad_norm": 0.8632974624633789, + "learning_rate": 4.841551078262631e-05, + "loss": 1.1357, + "step": 10200 + }, + { + "epoch": 3.096372742449537, + "grad_norm": 0.8956415057182312, + "learning_rate": 4.84104485167561e-05, + "loss": 1.1699, + "step": 10201 + }, + { + "epoch": 3.0966762786462287, + "grad_norm": 0.9562109112739563, + "learning_rate": 4.84053862508859e-05, + "loss": 0.9144, + "step": 10202 + }, + { + "epoch": 3.09697981484292, + "grad_norm": 0.8066214323043823, + "learning_rate": 4.840032398501569e-05, + "loss": 1.0127, + "step": 10203 + }, + { + "epoch": 3.0972833510396116, + "grad_norm": 1.0575859546661377, + "learning_rate": 4.839526171914549e-05, + "loss": 0.9995, + "step": 10204 + }, + { + "epoch": 3.097586887236303, + "grad_norm": 0.9900555610656738, + "learning_rate": 4.839019945327529e-05, + "loss": 1.2224, + "step": 10205 + }, + { + "epoch": 3.0978904234329945, + "grad_norm": 0.8321899175643921, + "learning_rate": 4.8385137187405084e-05, + "loss": 0.6701, + "step": 10206 + }, + { + "epoch": 3.0981939596296857, + "grad_norm": 0.9548383951187134, + "learning_rate": 4.838007492153488e-05, + "loss": 0.2594, + "step": 10207 + }, + { + "epoch": 3.0984974958263773, + "grad_norm": 0.937229335308075, + "learning_rate": 4.8375012655664674e-05, + "loss": 0.8891, + "step": 10208 + }, + { + "epoch": 3.0988010320230686, + "grad_norm": 1.152510404586792, + "learning_rate": 4.836995038979447e-05, + "loss": 1.5943, + "step": 10209 + }, + { + "epoch": 3.09910456821976, + "grad_norm": 1.3557219505310059, + "learning_rate": 4.836488812392428e-05, + "loss": 0.9124, + "step": 10210 + }, + { + "epoch": 3.099408104416452, + "grad_norm": 0.8536142706871033, + "learning_rate": 4.835982585805407e-05, + "loss": 1.5006, + "step": 10211 + }, + { + "epoch": 3.099711640613143, + "grad_norm": 0.7963231205940247, + "learning_rate": 4.8354763592183866e-05, + "loss": 1.4105, + "step": 10212 + }, + { + "epoch": 3.1000151768098347, + "grad_norm": 1.0872340202331543, + "learning_rate": 4.834970132631366e-05, + "loss": 0.824, + "step": 10213 + }, + { + "epoch": 3.100318713006526, + "grad_norm": 1.04911470413208, + "learning_rate": 4.8344639060443456e-05, + "loss": 0.9272, + "step": 10214 + }, + { + "epoch": 3.1006222492032176, + "grad_norm": 0.8675100803375244, + "learning_rate": 4.833957679457326e-05, + "loss": 1.0848, + "step": 10215 + }, + { + "epoch": 3.100925785399909, + "grad_norm": 0.7642478346824646, + "learning_rate": 4.833451452870305e-05, + "loss": 1.0753, + "step": 10216 + }, + { + "epoch": 3.1012293215966005, + "grad_norm": 1.0210033655166626, + "learning_rate": 4.832945226283285e-05, + "loss": 1.2022, + "step": 10217 + }, + { + "epoch": 3.1015328577932917, + "grad_norm": 0.7750602960586548, + "learning_rate": 4.832438999696264e-05, + "loss": 1.2958, + "step": 10218 + }, + { + "epoch": 3.1018363939899833, + "grad_norm": 0.8463450074195862, + "learning_rate": 4.831932773109244e-05, + "loss": 1.2487, + "step": 10219 + }, + { + "epoch": 3.1021399301866746, + "grad_norm": 1.0494881868362427, + "learning_rate": 4.831426546522224e-05, + "loss": 0.5179, + "step": 10220 + }, + { + "epoch": 3.102443466383366, + "grad_norm": 1.0515414476394653, + "learning_rate": 4.830920319935203e-05, + "loss": 0.7895, + "step": 10221 + }, + { + "epoch": 3.102747002580058, + "grad_norm": 1.2050992250442505, + "learning_rate": 4.830414093348183e-05, + "loss": 1.0161, + "step": 10222 + }, + { + "epoch": 3.103050538776749, + "grad_norm": 0.7721436619758606, + "learning_rate": 4.829907866761162e-05, + "loss": 1.2424, + "step": 10223 + }, + { + "epoch": 3.1033540749734407, + "grad_norm": 0.9620437622070312, + "learning_rate": 4.8294016401741424e-05, + "loss": 1.0933, + "step": 10224 + }, + { + "epoch": 3.103657611170132, + "grad_norm": 0.7904711961746216, + "learning_rate": 4.828895413587122e-05, + "loss": 1.1406, + "step": 10225 + }, + { + "epoch": 3.1039611473668236, + "grad_norm": 0.9906584024429321, + "learning_rate": 4.8283891870001014e-05, + "loss": 0.5739, + "step": 10226 + }, + { + "epoch": 3.104264683563515, + "grad_norm": 0.841254472732544, + "learning_rate": 4.827882960413081e-05, + "loss": 1.2472, + "step": 10227 + }, + { + "epoch": 3.1045682197602065, + "grad_norm": 0.970192551612854, + "learning_rate": 4.82737673382606e-05, + "loss": 1.0558, + "step": 10228 + }, + { + "epoch": 3.1048717559568977, + "grad_norm": 1.000087022781372, + "learning_rate": 4.8268705072390405e-05, + "loss": 0.8925, + "step": 10229 + }, + { + "epoch": 3.1051752921535893, + "grad_norm": 0.9151729345321655, + "learning_rate": 4.82636428065202e-05, + "loss": 0.744, + "step": 10230 + }, + { + "epoch": 3.1054788283502806, + "grad_norm": 0.9074206352233887, + "learning_rate": 4.8258580540649994e-05, + "loss": 0.8251, + "step": 10231 + }, + { + "epoch": 3.105782364546972, + "grad_norm": 0.7003865838050842, + "learning_rate": 4.8253518274779796e-05, + "loss": 0.6674, + "step": 10232 + }, + { + "epoch": 3.106085900743664, + "grad_norm": 0.9106037616729736, + "learning_rate": 4.824845600890959e-05, + "loss": 0.7907, + "step": 10233 + }, + { + "epoch": 3.106389436940355, + "grad_norm": 1.119046688079834, + "learning_rate": 4.8243393743039386e-05, + "loss": 0.6678, + "step": 10234 + }, + { + "epoch": 3.1066929731370467, + "grad_norm": 1.0848722457885742, + "learning_rate": 4.823833147716919e-05, + "loss": 0.7284, + "step": 10235 + }, + { + "epoch": 3.106996509333738, + "grad_norm": 1.070232629776001, + "learning_rate": 4.823326921129898e-05, + "loss": 1.006, + "step": 10236 + }, + { + "epoch": 3.1073000455304296, + "grad_norm": 0.900347113609314, + "learning_rate": 4.822820694542878e-05, + "loss": 0.9861, + "step": 10237 + }, + { + "epoch": 3.107603581727121, + "grad_norm": 0.8176512718200684, + "learning_rate": 4.822314467955857e-05, + "loss": 1.3572, + "step": 10238 + }, + { + "epoch": 3.1079071179238125, + "grad_norm": 1.2676304578781128, + "learning_rate": 4.821808241368837e-05, + "loss": 0.8002, + "step": 10239 + }, + { + "epoch": 3.1082106541205037, + "grad_norm": 0.7571612000465393, + "learning_rate": 4.821302014781817e-05, + "loss": 1.2142, + "step": 10240 + }, + { + "epoch": 3.1085141903171953, + "grad_norm": 0.8713750839233398, + "learning_rate": 4.820795788194796e-05, + "loss": 0.529, + "step": 10241 + }, + { + "epoch": 3.108817726513887, + "grad_norm": 0.8529492616653442, + "learning_rate": 4.820289561607776e-05, + "loss": 0.6161, + "step": 10242 + }, + { + "epoch": 3.109121262710578, + "grad_norm": 0.8320578336715698, + "learning_rate": 4.819783335020755e-05, + "loss": 1.1965, + "step": 10243 + }, + { + "epoch": 3.10942479890727, + "grad_norm": 1.3196736574172974, + "learning_rate": 4.8192771084337354e-05, + "loss": 0.895, + "step": 10244 + }, + { + "epoch": 3.109728335103961, + "grad_norm": 1.0062475204467773, + "learning_rate": 4.818770881846715e-05, + "loss": 0.6848, + "step": 10245 + }, + { + "epoch": 3.1100318713006527, + "grad_norm": 1.102092981338501, + "learning_rate": 4.8182646552596944e-05, + "loss": 0.6758, + "step": 10246 + }, + { + "epoch": 3.110335407497344, + "grad_norm": 1.0226787328720093, + "learning_rate": 4.817758428672674e-05, + "loss": 0.7453, + "step": 10247 + }, + { + "epoch": 3.1106389436940356, + "grad_norm": 1.014999270439148, + "learning_rate": 4.817252202085653e-05, + "loss": 1.0812, + "step": 10248 + }, + { + "epoch": 3.110942479890727, + "grad_norm": 0.887249231338501, + "learning_rate": 4.8167459754986335e-05, + "loss": 0.9665, + "step": 10249 + }, + { + "epoch": 3.1112460160874185, + "grad_norm": 1.070145606994629, + "learning_rate": 4.816239748911613e-05, + "loss": 0.8304, + "step": 10250 + }, + { + "epoch": 3.1115495522841097, + "grad_norm": 1.0190749168395996, + "learning_rate": 4.8157335223245924e-05, + "loss": 0.8443, + "step": 10251 + }, + { + "epoch": 3.1118530884808013, + "grad_norm": 1.037696123123169, + "learning_rate": 4.815227295737572e-05, + "loss": 1.133, + "step": 10252 + }, + { + "epoch": 3.112156624677493, + "grad_norm": 1.0444797277450562, + "learning_rate": 4.814721069150552e-05, + "loss": 0.771, + "step": 10253 + }, + { + "epoch": 3.112460160874184, + "grad_norm": 1.1078153848648071, + "learning_rate": 4.8142148425635315e-05, + "loss": 0.8704, + "step": 10254 + }, + { + "epoch": 3.112763697070876, + "grad_norm": 1.1167354583740234, + "learning_rate": 4.813708615976512e-05, + "loss": 0.6685, + "step": 10255 + }, + { + "epoch": 3.113067233267567, + "grad_norm": 0.9682241082191467, + "learning_rate": 4.813202389389491e-05, + "loss": 1.3158, + "step": 10256 + }, + { + "epoch": 3.1133707694642587, + "grad_norm": 1.1315803527832031, + "learning_rate": 4.8126961628024707e-05, + "loss": 0.8792, + "step": 10257 + }, + { + "epoch": 3.11367430566095, + "grad_norm": 0.8099128007888794, + "learning_rate": 4.81218993621545e-05, + "loss": 1.321, + "step": 10258 + }, + { + "epoch": 3.1139778418576416, + "grad_norm": 0.9294849634170532, + "learning_rate": 4.81168370962843e-05, + "loss": 0.9677, + "step": 10259 + }, + { + "epoch": 3.114281378054333, + "grad_norm": 0.9847825765609741, + "learning_rate": 4.81117748304141e-05, + "loss": 0.93, + "step": 10260 + }, + { + "epoch": 3.1145849142510245, + "grad_norm": 1.1325057744979858, + "learning_rate": 4.810671256454389e-05, + "loss": 0.9952, + "step": 10261 + }, + { + "epoch": 3.114888450447716, + "grad_norm": 0.9496695399284363, + "learning_rate": 4.810165029867369e-05, + "loss": 1.0787, + "step": 10262 + }, + { + "epoch": 3.1151919866444073, + "grad_norm": 1.022805094718933, + "learning_rate": 4.809658803280349e-05, + "loss": 0.5554, + "step": 10263 + }, + { + "epoch": 3.115495522841099, + "grad_norm": 0.956490159034729, + "learning_rate": 4.8091525766933284e-05, + "loss": 0.7193, + "step": 10264 + }, + { + "epoch": 3.11579905903779, + "grad_norm": 1.1204911470413208, + "learning_rate": 4.808646350106308e-05, + "loss": 1.0088, + "step": 10265 + }, + { + "epoch": 3.116102595234482, + "grad_norm": 0.8812878131866455, + "learning_rate": 4.808140123519287e-05, + "loss": 0.6885, + "step": 10266 + }, + { + "epoch": 3.116406131431173, + "grad_norm": 0.9081305265426636, + "learning_rate": 4.807633896932267e-05, + "loss": 0.905, + "step": 10267 + }, + { + "epoch": 3.1167096676278647, + "grad_norm": 1.2258888483047485, + "learning_rate": 4.807127670345247e-05, + "loss": 0.6928, + "step": 10268 + }, + { + "epoch": 3.117013203824556, + "grad_norm": 0.8976696729660034, + "learning_rate": 4.8066214437582264e-05, + "loss": 1.0546, + "step": 10269 + }, + { + "epoch": 3.1173167400212476, + "grad_norm": 1.2638503313064575, + "learning_rate": 4.806115217171206e-05, + "loss": 1.0135, + "step": 10270 + }, + { + "epoch": 3.117620276217939, + "grad_norm": 1.125235676765442, + "learning_rate": 4.8056089905841854e-05, + "loss": 0.9165, + "step": 10271 + }, + { + "epoch": 3.1179238124146305, + "grad_norm": 1.2102668285369873, + "learning_rate": 4.805102763997165e-05, + "loss": 0.5326, + "step": 10272 + }, + { + "epoch": 3.118227348611322, + "grad_norm": 0.6885108351707458, + "learning_rate": 4.804596537410145e-05, + "loss": 0.6967, + "step": 10273 + }, + { + "epoch": 3.1185308848080133, + "grad_norm": 1.2470431327819824, + "learning_rate": 4.8040903108231245e-05, + "loss": 0.4883, + "step": 10274 + }, + { + "epoch": 3.118834421004705, + "grad_norm": 0.6235316395759583, + "learning_rate": 4.803584084236104e-05, + "loss": 1.0372, + "step": 10275 + }, + { + "epoch": 3.119137957201396, + "grad_norm": 0.9666857719421387, + "learning_rate": 4.8030778576490835e-05, + "loss": 0.7957, + "step": 10276 + }, + { + "epoch": 3.119441493398088, + "grad_norm": 0.9099533557891846, + "learning_rate": 4.8025716310620636e-05, + "loss": 0.9213, + "step": 10277 + }, + { + "epoch": 3.119745029594779, + "grad_norm": 0.8261551856994629, + "learning_rate": 4.802065404475044e-05, + "loss": 1.2769, + "step": 10278 + }, + { + "epoch": 3.1200485657914707, + "grad_norm": 0.9195857048034668, + "learning_rate": 4.801559177888023e-05, + "loss": 0.5379, + "step": 10279 + }, + { + "epoch": 3.120352101988162, + "grad_norm": 0.9171074628829956, + "learning_rate": 4.801052951301003e-05, + "loss": 1.0875, + "step": 10280 + }, + { + "epoch": 3.1206556381848536, + "grad_norm": 0.9456160664558411, + "learning_rate": 4.800546724713982e-05, + "loss": 1.0587, + "step": 10281 + }, + { + "epoch": 3.120959174381545, + "grad_norm": 0.815257728099823, + "learning_rate": 4.800040498126962e-05, + "loss": 0.8471, + "step": 10282 + }, + { + "epoch": 3.1212627105782365, + "grad_norm": 0.9128398895263672, + "learning_rate": 4.799534271539942e-05, + "loss": 0.6253, + "step": 10283 + }, + { + "epoch": 3.121566246774928, + "grad_norm": 0.9814451336860657, + "learning_rate": 4.7990280449529213e-05, + "loss": 1.0832, + "step": 10284 + }, + { + "epoch": 3.1218697829716193, + "grad_norm": 0.9476622939109802, + "learning_rate": 4.798521818365901e-05, + "loss": 1.0724, + "step": 10285 + }, + { + "epoch": 3.122173319168311, + "grad_norm": 1.1996830701828003, + "learning_rate": 4.79801559177888e-05, + "loss": 0.9023, + "step": 10286 + }, + { + "epoch": 3.122476855365002, + "grad_norm": 0.9497719407081604, + "learning_rate": 4.79750936519186e-05, + "loss": 0.7901, + "step": 10287 + }, + { + "epoch": 3.122780391561694, + "grad_norm": 1.2093054056167603, + "learning_rate": 4.79700313860484e-05, + "loss": 0.5775, + "step": 10288 + }, + { + "epoch": 3.123083927758385, + "grad_norm": 1.0983867645263672, + "learning_rate": 4.7964969120178194e-05, + "loss": 0.8341, + "step": 10289 + }, + { + "epoch": 3.1233874639550767, + "grad_norm": 0.8961772918701172, + "learning_rate": 4.795990685430799e-05, + "loss": 0.6727, + "step": 10290 + }, + { + "epoch": 3.123691000151768, + "grad_norm": 1.1636279821395874, + "learning_rate": 4.7954844588437784e-05, + "loss": 0.6785, + "step": 10291 + }, + { + "epoch": 3.1239945363484596, + "grad_norm": 1.07382333278656, + "learning_rate": 4.7949782322567585e-05, + "loss": 0.6627, + "step": 10292 + }, + { + "epoch": 3.124298072545151, + "grad_norm": 1.058521032333374, + "learning_rate": 4.794472005669738e-05, + "loss": 0.9511, + "step": 10293 + }, + { + "epoch": 3.1246016087418425, + "grad_norm": 0.9444019794464111, + "learning_rate": 4.7939657790827175e-05, + "loss": 0.5904, + "step": 10294 + }, + { + "epoch": 3.124905144938534, + "grad_norm": 1.1124404668807983, + "learning_rate": 4.793459552495697e-05, + "loss": 0.9511, + "step": 10295 + }, + { + "epoch": 3.1252086811352253, + "grad_norm": 1.0147475004196167, + "learning_rate": 4.7929533259086765e-05, + "loss": 0.7573, + "step": 10296 + }, + { + "epoch": 3.125512217331917, + "grad_norm": 1.0679386854171753, + "learning_rate": 4.7924470993216566e-05, + "loss": 0.7939, + "step": 10297 + }, + { + "epoch": 3.125815753528608, + "grad_norm": 0.8408347368240356, + "learning_rate": 4.791940872734636e-05, + "loss": 1.5702, + "step": 10298 + }, + { + "epoch": 3.1261192897253, + "grad_norm": 0.8705650568008423, + "learning_rate": 4.791434646147616e-05, + "loss": 1.3437, + "step": 10299 + }, + { + "epoch": 3.126422825921991, + "grad_norm": 0.8601013422012329, + "learning_rate": 4.790928419560596e-05, + "loss": 0.9736, + "step": 10300 + }, + { + "epoch": 3.1267263621186827, + "grad_norm": 1.0429027080535889, + "learning_rate": 4.790422192973575e-05, + "loss": 1.008, + "step": 10301 + }, + { + "epoch": 3.127029898315374, + "grad_norm": 0.9696634411811829, + "learning_rate": 4.7899159663865554e-05, + "loss": 1.0081, + "step": 10302 + }, + { + "epoch": 3.1273334345120656, + "grad_norm": 1.1295794248580933, + "learning_rate": 4.789409739799535e-05, + "loss": 0.5938, + "step": 10303 + }, + { + "epoch": 3.127636970708757, + "grad_norm": 1.054425597190857, + "learning_rate": 4.788903513212514e-05, + "loss": 1.0564, + "step": 10304 + }, + { + "epoch": 3.1279405069054484, + "grad_norm": 0.9547919631004333, + "learning_rate": 4.788397286625494e-05, + "loss": 1.0656, + "step": 10305 + }, + { + "epoch": 3.12824404310214, + "grad_norm": 0.9968164563179016, + "learning_rate": 4.787891060038473e-05, + "loss": 0.8413, + "step": 10306 + }, + { + "epoch": 3.1285475792988313, + "grad_norm": 0.9305695295333862, + "learning_rate": 4.7873848334514534e-05, + "loss": 0.318, + "step": 10307 + }, + { + "epoch": 3.128851115495523, + "grad_norm": 1.1401594877243042, + "learning_rate": 4.786878606864433e-05, + "loss": 0.7914, + "step": 10308 + }, + { + "epoch": 3.129154651692214, + "grad_norm": 1.0903087854385376, + "learning_rate": 4.7863723802774124e-05, + "loss": 1.0287, + "step": 10309 + }, + { + "epoch": 3.129458187888906, + "grad_norm": 0.8761926293373108, + "learning_rate": 4.785866153690392e-05, + "loss": 1.2994, + "step": 10310 + }, + { + "epoch": 3.129761724085597, + "grad_norm": 1.1542888879776, + "learning_rate": 4.7853599271033714e-05, + "loss": 0.6933, + "step": 10311 + }, + { + "epoch": 3.1300652602822887, + "grad_norm": 0.9919973015785217, + "learning_rate": 4.7848537005163515e-05, + "loss": 0.3055, + "step": 10312 + }, + { + "epoch": 3.13036879647898, + "grad_norm": 1.0525269508361816, + "learning_rate": 4.784347473929331e-05, + "loss": 1.0897, + "step": 10313 + }, + { + "epoch": 3.1306723326756716, + "grad_norm": 1.2305265665054321, + "learning_rate": 4.7838412473423105e-05, + "loss": 0.5867, + "step": 10314 + }, + { + "epoch": 3.1309758688723632, + "grad_norm": 0.8726613521575928, + "learning_rate": 4.78333502075529e-05, + "loss": 1.0343, + "step": 10315 + }, + { + "epoch": 3.1312794050690544, + "grad_norm": 1.1548607349395752, + "learning_rate": 4.78282879416827e-05, + "loss": 1.2719, + "step": 10316 + }, + { + "epoch": 3.131582941265746, + "grad_norm": 0.8539901375770569, + "learning_rate": 4.7823225675812496e-05, + "loss": 0.9723, + "step": 10317 + }, + { + "epoch": 3.1318864774624373, + "grad_norm": 1.0392454862594604, + "learning_rate": 4.781816340994229e-05, + "loss": 0.7059, + "step": 10318 + }, + { + "epoch": 3.132190013659129, + "grad_norm": 1.0322120189666748, + "learning_rate": 4.7813101144072085e-05, + "loss": 0.9134, + "step": 10319 + }, + { + "epoch": 3.13249354985582, + "grad_norm": 0.9999468922615051, + "learning_rate": 4.780803887820188e-05, + "loss": 0.5306, + "step": 10320 + }, + { + "epoch": 3.132797086052512, + "grad_norm": 1.1880388259887695, + "learning_rate": 4.780297661233168e-05, + "loss": 1.0748, + "step": 10321 + }, + { + "epoch": 3.133100622249203, + "grad_norm": 0.8774686455726624, + "learning_rate": 4.7797914346461483e-05, + "loss": 0.8053, + "step": 10322 + }, + { + "epoch": 3.1334041584458947, + "grad_norm": 0.8681316375732422, + "learning_rate": 4.779285208059128e-05, + "loss": 0.7412, + "step": 10323 + }, + { + "epoch": 3.1337076946425864, + "grad_norm": 1.1811221837997437, + "learning_rate": 4.778778981472107e-05, + "loss": 0.6731, + "step": 10324 + }, + { + "epoch": 3.1340112308392776, + "grad_norm": 0.9811843037605286, + "learning_rate": 4.778272754885087e-05, + "loss": 1.005, + "step": 10325 + }, + { + "epoch": 3.1343147670359692, + "grad_norm": 0.9603432416915894, + "learning_rate": 4.777766528298066e-05, + "loss": 0.5503, + "step": 10326 + }, + { + "epoch": 3.1346183032326604, + "grad_norm": 1.0307477712631226, + "learning_rate": 4.7772603017110464e-05, + "loss": 0.7452, + "step": 10327 + }, + { + "epoch": 3.134921839429352, + "grad_norm": 0.6428546905517578, + "learning_rate": 4.776754075124026e-05, + "loss": 1.0087, + "step": 10328 + }, + { + "epoch": 3.1352253756260433, + "grad_norm": 1.0561267137527466, + "learning_rate": 4.7762478485370054e-05, + "loss": 1.0308, + "step": 10329 + }, + { + "epoch": 3.135528911822735, + "grad_norm": 0.8425313234329224, + "learning_rate": 4.775741621949985e-05, + "loss": 0.6882, + "step": 10330 + }, + { + "epoch": 3.135832448019426, + "grad_norm": 0.9832221865653992, + "learning_rate": 4.775235395362965e-05, + "loss": 1.0857, + "step": 10331 + }, + { + "epoch": 3.136135984216118, + "grad_norm": 0.8301721215248108, + "learning_rate": 4.7747291687759445e-05, + "loss": 1.2715, + "step": 10332 + }, + { + "epoch": 3.136439520412809, + "grad_norm": 1.163613200187683, + "learning_rate": 4.774222942188924e-05, + "loss": 0.888, + "step": 10333 + }, + { + "epoch": 3.1367430566095007, + "grad_norm": 1.1844435930252075, + "learning_rate": 4.7737167156019034e-05, + "loss": 0.8255, + "step": 10334 + }, + { + "epoch": 3.1370465928061924, + "grad_norm": 1.109598994255066, + "learning_rate": 4.773210489014883e-05, + "loss": 0.6234, + "step": 10335 + }, + { + "epoch": 3.1373501290028836, + "grad_norm": 1.1785974502563477, + "learning_rate": 4.772704262427863e-05, + "loss": 0.6707, + "step": 10336 + }, + { + "epoch": 3.1376536651995752, + "grad_norm": 1.2761236429214478, + "learning_rate": 4.7721980358408426e-05, + "loss": 0.7895, + "step": 10337 + }, + { + "epoch": 3.1379572013962664, + "grad_norm": 0.7199037075042725, + "learning_rate": 4.771691809253822e-05, + "loss": 0.471, + "step": 10338 + }, + { + "epoch": 3.138260737592958, + "grad_norm": 0.9055795669555664, + "learning_rate": 4.7711855826668015e-05, + "loss": 1.4064, + "step": 10339 + }, + { + "epoch": 3.1385642737896493, + "grad_norm": 0.6938846111297607, + "learning_rate": 4.770679356079781e-05, + "loss": 0.8794, + "step": 10340 + }, + { + "epoch": 3.138867809986341, + "grad_norm": 0.6006743311882019, + "learning_rate": 4.770173129492761e-05, + "loss": 1.2541, + "step": 10341 + }, + { + "epoch": 3.139171346183032, + "grad_norm": 1.1253604888916016, + "learning_rate": 4.7696669029057406e-05, + "loss": 1.0626, + "step": 10342 + }, + { + "epoch": 3.139474882379724, + "grad_norm": 0.8586475849151611, + "learning_rate": 4.76916067631872e-05, + "loss": 1.2298, + "step": 10343 + }, + { + "epoch": 3.139778418576415, + "grad_norm": 1.0116828680038452, + "learning_rate": 4.7686544497317e-05, + "loss": 0.9243, + "step": 10344 + }, + { + "epoch": 3.1400819547731067, + "grad_norm": 1.025334119796753, + "learning_rate": 4.76814822314468e-05, + "loss": 1.1037, + "step": 10345 + }, + { + "epoch": 3.1403854909697984, + "grad_norm": 1.1306514739990234, + "learning_rate": 4.76764199655766e-05, + "loss": 0.5638, + "step": 10346 + }, + { + "epoch": 3.1406890271664896, + "grad_norm": 1.0380830764770508, + "learning_rate": 4.7671357699706394e-05, + "loss": 1.2348, + "step": 10347 + }, + { + "epoch": 3.1409925633631812, + "grad_norm": 1.0046030282974243, + "learning_rate": 4.766629543383619e-05, + "loss": 0.8616, + "step": 10348 + }, + { + "epoch": 3.1412960995598724, + "grad_norm": 0.8449072241783142, + "learning_rate": 4.7661233167965984e-05, + "loss": 0.494, + "step": 10349 + }, + { + "epoch": 3.141599635756564, + "grad_norm": 1.1011278629302979, + "learning_rate": 4.765617090209578e-05, + "loss": 0.4242, + "step": 10350 + }, + { + "epoch": 3.1419031719532553, + "grad_norm": 1.1555718183517456, + "learning_rate": 4.765110863622558e-05, + "loss": 0.5409, + "step": 10351 + }, + { + "epoch": 3.142206708149947, + "grad_norm": 0.7693044543266296, + "learning_rate": 4.7646046370355375e-05, + "loss": 0.8941, + "step": 10352 + }, + { + "epoch": 3.142510244346638, + "grad_norm": 1.0779861211776733, + "learning_rate": 4.764098410448517e-05, + "loss": 0.5592, + "step": 10353 + }, + { + "epoch": 3.14281378054333, + "grad_norm": 0.9634519815444946, + "learning_rate": 4.7635921838614964e-05, + "loss": 1.2483, + "step": 10354 + }, + { + "epoch": 3.143117316740021, + "grad_norm": 0.9396659731864929, + "learning_rate": 4.7630859572744766e-05, + "loss": 0.6706, + "step": 10355 + }, + { + "epoch": 3.1434208529367127, + "grad_norm": 0.9157540798187256, + "learning_rate": 4.762579730687456e-05, + "loss": 0.8203, + "step": 10356 + }, + { + "epoch": 3.1437243891334044, + "grad_norm": 1.2979938983917236, + "learning_rate": 4.7620735041004355e-05, + "loss": 0.8856, + "step": 10357 + }, + { + "epoch": 3.1440279253300956, + "grad_norm": 1.013296365737915, + "learning_rate": 4.761567277513415e-05, + "loss": 0.9367, + "step": 10358 + }, + { + "epoch": 3.144331461526787, + "grad_norm": 1.0508990287780762, + "learning_rate": 4.7610610509263945e-05, + "loss": 0.7957, + "step": 10359 + }, + { + "epoch": 3.1446349977234784, + "grad_norm": 1.072538137435913, + "learning_rate": 4.7605548243393747e-05, + "loss": 0.9325, + "step": 10360 + }, + { + "epoch": 3.14493853392017, + "grad_norm": 0.9005856513977051, + "learning_rate": 4.760048597752354e-05, + "loss": 0.7038, + "step": 10361 + }, + { + "epoch": 3.1452420701168613, + "grad_norm": 0.8835384249687195, + "learning_rate": 4.7595423711653336e-05, + "loss": 1.3903, + "step": 10362 + }, + { + "epoch": 3.145545606313553, + "grad_norm": 0.9588941335678101, + "learning_rate": 4.759036144578313e-05, + "loss": 1.2963, + "step": 10363 + }, + { + "epoch": 3.145849142510244, + "grad_norm": 1.0200319290161133, + "learning_rate": 4.7585299179912926e-05, + "loss": 1.0434, + "step": 10364 + }, + { + "epoch": 3.146152678706936, + "grad_norm": 1.1441307067871094, + "learning_rate": 4.758023691404273e-05, + "loss": 1.0135, + "step": 10365 + }, + { + "epoch": 3.146456214903627, + "grad_norm": 1.2910655736923218, + "learning_rate": 4.757517464817253e-05, + "loss": 0.9647, + "step": 10366 + }, + { + "epoch": 3.1467597511003187, + "grad_norm": 0.6326399445533752, + "learning_rate": 4.7570112382302324e-05, + "loss": 1.0226, + "step": 10367 + }, + { + "epoch": 3.1470632872970103, + "grad_norm": 0.8939344882965088, + "learning_rate": 4.756505011643212e-05, + "loss": 0.4913, + "step": 10368 + }, + { + "epoch": 3.1473668234937016, + "grad_norm": 0.8687292337417603, + "learning_rate": 4.755998785056191e-05, + "loss": 1.0922, + "step": 10369 + }, + { + "epoch": 3.147670359690393, + "grad_norm": 1.1645346879959106, + "learning_rate": 4.7554925584691715e-05, + "loss": 0.932, + "step": 10370 + }, + { + "epoch": 3.1479738958870844, + "grad_norm": 0.9731719493865967, + "learning_rate": 4.754986331882151e-05, + "loss": 0.8353, + "step": 10371 + }, + { + "epoch": 3.148277432083776, + "grad_norm": 0.8632399439811707, + "learning_rate": 4.7544801052951304e-05, + "loss": 1.0655, + "step": 10372 + }, + { + "epoch": 3.1485809682804673, + "grad_norm": 0.9961166381835938, + "learning_rate": 4.75397387870811e-05, + "loss": 0.5428, + "step": 10373 + }, + { + "epoch": 3.148884504477159, + "grad_norm": 0.8651885390281677, + "learning_rate": 4.7534676521210894e-05, + "loss": 0.8326, + "step": 10374 + }, + { + "epoch": 3.14918804067385, + "grad_norm": 1.0654666423797607, + "learning_rate": 4.7529614255340696e-05, + "loss": 0.8226, + "step": 10375 + }, + { + "epoch": 3.149491576870542, + "grad_norm": 1.018639087677002, + "learning_rate": 4.752455198947049e-05, + "loss": 0.7476, + "step": 10376 + }, + { + "epoch": 3.149795113067233, + "grad_norm": 0.7021937370300293, + "learning_rate": 4.7519489723600285e-05, + "loss": 0.6445, + "step": 10377 + }, + { + "epoch": 3.1500986492639247, + "grad_norm": 0.9287295341491699, + "learning_rate": 4.751442745773008e-05, + "loss": 0.4259, + "step": 10378 + }, + { + "epoch": 3.1504021854606163, + "grad_norm": 1.0464659929275513, + "learning_rate": 4.7509365191859875e-05, + "loss": 1.0301, + "step": 10379 + }, + { + "epoch": 3.1507057216573076, + "grad_norm": 1.0281345844268799, + "learning_rate": 4.7504302925989676e-05, + "loss": 0.7104, + "step": 10380 + }, + { + "epoch": 3.151009257853999, + "grad_norm": 1.205594778060913, + "learning_rate": 4.749924066011947e-05, + "loss": 1.1167, + "step": 10381 + }, + { + "epoch": 3.1513127940506904, + "grad_norm": 1.0006812810897827, + "learning_rate": 4.7494178394249266e-05, + "loss": 1.1696, + "step": 10382 + }, + { + "epoch": 3.151616330247382, + "grad_norm": 1.0473226308822632, + "learning_rate": 4.748911612837906e-05, + "loss": 1.046, + "step": 10383 + }, + { + "epoch": 3.1519198664440733, + "grad_norm": 1.2342840433120728, + "learning_rate": 4.748405386250886e-05, + "loss": 0.9611, + "step": 10384 + }, + { + "epoch": 3.152223402640765, + "grad_norm": 0.9438900947570801, + "learning_rate": 4.747899159663866e-05, + "loss": 0.9055, + "step": 10385 + }, + { + "epoch": 3.152526938837456, + "grad_norm": 0.8948186635971069, + "learning_rate": 4.747392933076845e-05, + "loss": 0.4316, + "step": 10386 + }, + { + "epoch": 3.152830475034148, + "grad_norm": 0.929832935333252, + "learning_rate": 4.746886706489825e-05, + "loss": 0.7834, + "step": 10387 + }, + { + "epoch": 3.1531340112308395, + "grad_norm": 0.8663754463195801, + "learning_rate": 4.746380479902805e-05, + "loss": 1.3126, + "step": 10388 + }, + { + "epoch": 3.1534375474275307, + "grad_norm": 1.087110161781311, + "learning_rate": 4.745874253315784e-05, + "loss": 1.1132, + "step": 10389 + }, + { + "epoch": 3.1537410836242223, + "grad_norm": 0.9091704487800598, + "learning_rate": 4.7453680267287645e-05, + "loss": 1.3182, + "step": 10390 + }, + { + "epoch": 3.1540446198209136, + "grad_norm": 0.8646488785743713, + "learning_rate": 4.744861800141744e-05, + "loss": 0.8117, + "step": 10391 + }, + { + "epoch": 3.154348156017605, + "grad_norm": 1.0429656505584717, + "learning_rate": 4.7443555735547234e-05, + "loss": 0.7562, + "step": 10392 + }, + { + "epoch": 3.1546516922142964, + "grad_norm": 0.8698205947875977, + "learning_rate": 4.743849346967703e-05, + "loss": 0.4087, + "step": 10393 + }, + { + "epoch": 3.154955228410988, + "grad_norm": 0.9110996723175049, + "learning_rate": 4.743343120380683e-05, + "loss": 1.1237, + "step": 10394 + }, + { + "epoch": 3.1552587646076793, + "grad_norm": 1.2024277448654175, + "learning_rate": 4.7428368937936625e-05, + "loss": 1.2586, + "step": 10395 + }, + { + "epoch": 3.155562300804371, + "grad_norm": 0.9435782432556152, + "learning_rate": 4.742330667206642e-05, + "loss": 1.0678, + "step": 10396 + }, + { + "epoch": 3.1558658370010626, + "grad_norm": 0.6518744230270386, + "learning_rate": 4.7418244406196215e-05, + "loss": 0.8844, + "step": 10397 + }, + { + "epoch": 3.156169373197754, + "grad_norm": 1.1847448348999023, + "learning_rate": 4.741318214032601e-05, + "loss": 0.7989, + "step": 10398 + }, + { + "epoch": 3.1564729093944455, + "grad_norm": 0.9844532608985901, + "learning_rate": 4.740811987445581e-05, + "loss": 0.8846, + "step": 10399 + }, + { + "epoch": 3.1567764455911367, + "grad_norm": 0.8663511872291565, + "learning_rate": 4.7403057608585606e-05, + "loss": 1.2416, + "step": 10400 + }, + { + "epoch": 3.1570799817878283, + "grad_norm": 1.213365912437439, + "learning_rate": 4.73979953427154e-05, + "loss": 0.7979, + "step": 10401 + }, + { + "epoch": 3.1573835179845195, + "grad_norm": 0.892488420009613, + "learning_rate": 4.7392933076845196e-05, + "loss": 1.4371, + "step": 10402 + }, + { + "epoch": 3.157687054181211, + "grad_norm": 1.232032060623169, + "learning_rate": 4.738787081097499e-05, + "loss": 0.8251, + "step": 10403 + }, + { + "epoch": 3.1579905903779024, + "grad_norm": 0.8439121246337891, + "learning_rate": 4.738280854510479e-05, + "loss": 1.3247, + "step": 10404 + }, + { + "epoch": 3.158294126574594, + "grad_norm": 0.9805055260658264, + "learning_rate": 4.737774627923459e-05, + "loss": 1.0891, + "step": 10405 + }, + { + "epoch": 3.1585976627712853, + "grad_norm": 1.115586757659912, + "learning_rate": 4.737268401336438e-05, + "loss": 1.0207, + "step": 10406 + }, + { + "epoch": 3.158901198967977, + "grad_norm": 0.8928739428520203, + "learning_rate": 4.7367621747494176e-05, + "loss": 1.2227, + "step": 10407 + }, + { + "epoch": 3.1592047351646686, + "grad_norm": 0.9928413033485413, + "learning_rate": 4.736255948162398e-05, + "loss": 0.9294, + "step": 10408 + }, + { + "epoch": 3.15950827136136, + "grad_norm": 0.8878251910209656, + "learning_rate": 4.735749721575377e-05, + "loss": 1.001, + "step": 10409 + }, + { + "epoch": 3.1598118075580515, + "grad_norm": 1.1200608015060425, + "learning_rate": 4.7352434949883574e-05, + "loss": 0.83, + "step": 10410 + }, + { + "epoch": 3.1601153437547427, + "grad_norm": 0.8669918179512024, + "learning_rate": 4.734737268401337e-05, + "loss": 1.1585, + "step": 10411 + }, + { + "epoch": 3.1604188799514343, + "grad_norm": 1.0759658813476562, + "learning_rate": 4.7342310418143164e-05, + "loss": 1.2066, + "step": 10412 + }, + { + "epoch": 3.1607224161481255, + "grad_norm": 0.9969993233680725, + "learning_rate": 4.733724815227296e-05, + "loss": 0.7108, + "step": 10413 + }, + { + "epoch": 3.161025952344817, + "grad_norm": 1.0586049556732178, + "learning_rate": 4.733218588640276e-05, + "loss": 0.8418, + "step": 10414 + }, + { + "epoch": 3.1613294885415084, + "grad_norm": 1.1259765625, + "learning_rate": 4.7327123620532555e-05, + "loss": 0.8785, + "step": 10415 + }, + { + "epoch": 3.1616330247382, + "grad_norm": 1.2025822401046753, + "learning_rate": 4.732206135466235e-05, + "loss": 0.8841, + "step": 10416 + }, + { + "epoch": 3.1619365609348913, + "grad_norm": 1.0927627086639404, + "learning_rate": 4.7316999088792145e-05, + "loss": 0.6464, + "step": 10417 + }, + { + "epoch": 3.162240097131583, + "grad_norm": 0.8815931677818298, + "learning_rate": 4.731193682292194e-05, + "loss": 0.8074, + "step": 10418 + }, + { + "epoch": 3.1625436333282746, + "grad_norm": 1.1449110507965088, + "learning_rate": 4.730687455705174e-05, + "loss": 0.8631, + "step": 10419 + }, + { + "epoch": 3.162847169524966, + "grad_norm": 1.1215803623199463, + "learning_rate": 4.7301812291181536e-05, + "loss": 0.7708, + "step": 10420 + }, + { + "epoch": 3.1631507057216575, + "grad_norm": 0.8856171369552612, + "learning_rate": 4.729675002531133e-05, + "loss": 0.9642, + "step": 10421 + }, + { + "epoch": 3.1634542419183487, + "grad_norm": 1.2143223285675049, + "learning_rate": 4.7291687759441125e-05, + "loss": 0.7203, + "step": 10422 + }, + { + "epoch": 3.1637577781150403, + "grad_norm": 1.018662452697754, + "learning_rate": 4.728662549357093e-05, + "loss": 0.8966, + "step": 10423 + }, + { + "epoch": 3.1640613143117315, + "grad_norm": 0.9518203139305115, + "learning_rate": 4.728156322770072e-05, + "loss": 1.0437, + "step": 10424 + }, + { + "epoch": 3.164364850508423, + "grad_norm": 0.6670674085617065, + "learning_rate": 4.727650096183052e-05, + "loss": 0.6696, + "step": 10425 + }, + { + "epoch": 3.1646683867051144, + "grad_norm": 1.1253470182418823, + "learning_rate": 4.727143869596031e-05, + "loss": 0.8989, + "step": 10426 + }, + { + "epoch": 3.164971922901806, + "grad_norm": 0.8004952073097229, + "learning_rate": 4.7266376430090106e-05, + "loss": 0.8064, + "step": 10427 + }, + { + "epoch": 3.1652754590984973, + "grad_norm": 1.1187547445297241, + "learning_rate": 4.726131416421991e-05, + "loss": 0.8807, + "step": 10428 + }, + { + "epoch": 3.165578995295189, + "grad_norm": 1.0555150508880615, + "learning_rate": 4.72562518983497e-05, + "loss": 1.052, + "step": 10429 + }, + { + "epoch": 3.1658825314918806, + "grad_norm": 0.9098812937736511, + "learning_rate": 4.72511896324795e-05, + "loss": 1.0031, + "step": 10430 + }, + { + "epoch": 3.166186067688572, + "grad_norm": 1.0820943117141724, + "learning_rate": 4.724612736660929e-05, + "loss": 0.6131, + "step": 10431 + }, + { + "epoch": 3.1664896038852635, + "grad_norm": 1.120794653892517, + "learning_rate": 4.724106510073909e-05, + "loss": 0.701, + "step": 10432 + }, + { + "epoch": 3.1667931400819547, + "grad_norm": 0.8363889455795288, + "learning_rate": 4.7236002834868895e-05, + "loss": 1.2581, + "step": 10433 + }, + { + "epoch": 3.1670966762786463, + "grad_norm": 1.090535044670105, + "learning_rate": 4.723094056899869e-05, + "loss": 0.6682, + "step": 10434 + }, + { + "epoch": 3.1674002124753375, + "grad_norm": 0.944506049156189, + "learning_rate": 4.7225878303128485e-05, + "loss": 1.2196, + "step": 10435 + }, + { + "epoch": 3.167703748672029, + "grad_norm": 0.9995464086532593, + "learning_rate": 4.722081603725828e-05, + "loss": 1.0312, + "step": 10436 + }, + { + "epoch": 3.1680072848687204, + "grad_norm": 1.0534577369689941, + "learning_rate": 4.7215753771388075e-05, + "loss": 0.8441, + "step": 10437 + }, + { + "epoch": 3.168310821065412, + "grad_norm": 0.9389618635177612, + "learning_rate": 4.7210691505517876e-05, + "loss": 1.2046, + "step": 10438 + }, + { + "epoch": 3.1686143572621033, + "grad_norm": 1.1414529085159302, + "learning_rate": 4.720562923964767e-05, + "loss": 1.1394, + "step": 10439 + }, + { + "epoch": 3.168917893458795, + "grad_norm": 0.5881212949752808, + "learning_rate": 4.7200566973777466e-05, + "loss": 1.1014, + "step": 10440 + }, + { + "epoch": 3.1692214296554866, + "grad_norm": 0.8964493870735168, + "learning_rate": 4.719550470790726e-05, + "loss": 0.7681, + "step": 10441 + }, + { + "epoch": 3.169524965852178, + "grad_norm": 0.969706654548645, + "learning_rate": 4.7190442442037055e-05, + "loss": 1.3137, + "step": 10442 + }, + { + "epoch": 3.1698285020488695, + "grad_norm": 0.6851428747177124, + "learning_rate": 4.718538017616686e-05, + "loss": 0.8572, + "step": 10443 + }, + { + "epoch": 3.1701320382455607, + "grad_norm": 1.0048916339874268, + "learning_rate": 4.718031791029665e-05, + "loss": 0.9452, + "step": 10444 + }, + { + "epoch": 3.1704355744422523, + "grad_norm": 0.7481570839881897, + "learning_rate": 4.7175255644426446e-05, + "loss": 1.3663, + "step": 10445 + }, + { + "epoch": 3.1707391106389435, + "grad_norm": 0.8221026062965393, + "learning_rate": 4.717019337855624e-05, + "loss": 0.6139, + "step": 10446 + }, + { + "epoch": 3.171042646835635, + "grad_norm": 1.0897480249404907, + "learning_rate": 4.716513111268604e-05, + "loss": 0.6395, + "step": 10447 + }, + { + "epoch": 3.1713461830323264, + "grad_norm": 0.8052000403404236, + "learning_rate": 4.716006884681584e-05, + "loss": 1.2836, + "step": 10448 + }, + { + "epoch": 3.171649719229018, + "grad_norm": 1.1463459730148315, + "learning_rate": 4.715500658094563e-05, + "loss": 0.9871, + "step": 10449 + }, + { + "epoch": 3.1719532554257097, + "grad_norm": 0.812283992767334, + "learning_rate": 4.714994431507543e-05, + "loss": 0.7682, + "step": 10450 + }, + { + "epoch": 3.172256791622401, + "grad_norm": 0.995480477809906, + "learning_rate": 4.714488204920522e-05, + "loss": 1.0682, + "step": 10451 + }, + { + "epoch": 3.1725603278190926, + "grad_norm": 1.1444796323776245, + "learning_rate": 4.7139819783335024e-05, + "loss": 0.881, + "step": 10452 + }, + { + "epoch": 3.172863864015784, + "grad_norm": 1.1898192167282104, + "learning_rate": 4.713475751746482e-05, + "loss": 0.6664, + "step": 10453 + }, + { + "epoch": 3.1731674002124755, + "grad_norm": 0.9991341233253479, + "learning_rate": 4.712969525159461e-05, + "loss": 0.8315, + "step": 10454 + }, + { + "epoch": 3.1734709364091667, + "grad_norm": 1.060228943824768, + "learning_rate": 4.7124632985724415e-05, + "loss": 0.8037, + "step": 10455 + }, + { + "epoch": 3.1737744726058583, + "grad_norm": 0.849179744720459, + "learning_rate": 4.711957071985421e-05, + "loss": 1.3346, + "step": 10456 + }, + { + "epoch": 3.1740780088025495, + "grad_norm": 1.158487319946289, + "learning_rate": 4.7114508453984004e-05, + "loss": 0.7582, + "step": 10457 + }, + { + "epoch": 3.174381544999241, + "grad_norm": 1.0423033237457275, + "learning_rate": 4.7109446188113806e-05, + "loss": 1.3551, + "step": 10458 + }, + { + "epoch": 3.174685081195933, + "grad_norm": 1.0091899633407593, + "learning_rate": 4.71043839222436e-05, + "loss": 0.9523, + "step": 10459 + }, + { + "epoch": 3.174988617392624, + "grad_norm": 0.9938556551933289, + "learning_rate": 4.7099321656373395e-05, + "loss": 0.9335, + "step": 10460 + }, + { + "epoch": 3.1752921535893157, + "grad_norm": 1.2174179553985596, + "learning_rate": 4.709425939050319e-05, + "loss": 0.9723, + "step": 10461 + }, + { + "epoch": 3.175595689786007, + "grad_norm": 0.8016282320022583, + "learning_rate": 4.708919712463299e-05, + "loss": 0.8104, + "step": 10462 + }, + { + "epoch": 3.1758992259826986, + "grad_norm": 1.0852882862091064, + "learning_rate": 4.7084134858762787e-05, + "loss": 1.0255, + "step": 10463 + }, + { + "epoch": 3.17620276217939, + "grad_norm": 0.8602305054664612, + "learning_rate": 4.707907259289258e-05, + "loss": 0.5075, + "step": 10464 + }, + { + "epoch": 3.1765062983760814, + "grad_norm": 1.0599433183670044, + "learning_rate": 4.7074010327022376e-05, + "loss": 1.0491, + "step": 10465 + }, + { + "epoch": 3.1768098345727727, + "grad_norm": 0.9715973138809204, + "learning_rate": 4.706894806115217e-05, + "loss": 0.6836, + "step": 10466 + }, + { + "epoch": 3.1771133707694643, + "grad_norm": 0.8323686718940735, + "learning_rate": 4.706388579528197e-05, + "loss": 0.3895, + "step": 10467 + }, + { + "epoch": 3.1774169069661555, + "grad_norm": 1.0993438959121704, + "learning_rate": 4.705882352941177e-05, + "loss": 0.869, + "step": 10468 + }, + { + "epoch": 3.177720443162847, + "grad_norm": 1.0233573913574219, + "learning_rate": 4.705376126354156e-05, + "loss": 0.7826, + "step": 10469 + }, + { + "epoch": 3.178023979359539, + "grad_norm": 1.0313493013381958, + "learning_rate": 4.704869899767136e-05, + "loss": 0.5693, + "step": 10470 + }, + { + "epoch": 3.17832751555623, + "grad_norm": 1.3386132717132568, + "learning_rate": 4.704363673180115e-05, + "loss": 0.7943, + "step": 10471 + }, + { + "epoch": 3.1786310517529217, + "grad_norm": 0.9544557929039001, + "learning_rate": 4.703857446593095e-05, + "loss": 0.642, + "step": 10472 + }, + { + "epoch": 3.178934587949613, + "grad_norm": 1.1685975790023804, + "learning_rate": 4.703351220006075e-05, + "loss": 0.6335, + "step": 10473 + }, + { + "epoch": 3.1792381241463046, + "grad_norm": 1.04962158203125, + "learning_rate": 4.702844993419054e-05, + "loss": 1.1251, + "step": 10474 + }, + { + "epoch": 3.179541660342996, + "grad_norm": 0.8730882406234741, + "learning_rate": 4.702338766832034e-05, + "loss": 0.9079, + "step": 10475 + }, + { + "epoch": 3.1798451965396874, + "grad_norm": 0.9799057841300964, + "learning_rate": 4.701832540245014e-05, + "loss": 0.6009, + "step": 10476 + }, + { + "epoch": 3.1801487327363787, + "grad_norm": 0.8249173760414124, + "learning_rate": 4.701326313657994e-05, + "loss": 0.9953, + "step": 10477 + }, + { + "epoch": 3.1804522689330703, + "grad_norm": 0.8999932408332825, + "learning_rate": 4.7008200870709736e-05, + "loss": 1.11, + "step": 10478 + }, + { + "epoch": 3.1807558051297615, + "grad_norm": 1.1642708778381348, + "learning_rate": 4.700313860483953e-05, + "loss": 0.6894, + "step": 10479 + }, + { + "epoch": 3.181059341326453, + "grad_norm": 0.8566053509712219, + "learning_rate": 4.6998076338969325e-05, + "loss": 1.3424, + "step": 10480 + }, + { + "epoch": 3.181362877523145, + "grad_norm": 0.8875060677528381, + "learning_rate": 4.699301407309912e-05, + "loss": 0.8253, + "step": 10481 + }, + { + "epoch": 3.181666413719836, + "grad_norm": 0.9002801179885864, + "learning_rate": 4.698795180722892e-05, + "loss": 0.8687, + "step": 10482 + }, + { + "epoch": 3.1819699499165277, + "grad_norm": 0.9148478507995605, + "learning_rate": 4.6982889541358716e-05, + "loss": 0.9476, + "step": 10483 + }, + { + "epoch": 3.182273486113219, + "grad_norm": 0.8363088965415955, + "learning_rate": 4.697782727548851e-05, + "loss": 0.8666, + "step": 10484 + }, + { + "epoch": 3.1825770223099106, + "grad_norm": 0.9983605146408081, + "learning_rate": 4.6972765009618306e-05, + "loss": 0.9889, + "step": 10485 + }, + { + "epoch": 3.182880558506602, + "grad_norm": 1.1742757558822632, + "learning_rate": 4.696770274374811e-05, + "loss": 0.854, + "step": 10486 + }, + { + "epoch": 3.1831840947032934, + "grad_norm": 1.0945922136306763, + "learning_rate": 4.69626404778779e-05, + "loss": 1.0521, + "step": 10487 + }, + { + "epoch": 3.1834876308999847, + "grad_norm": 0.8791086077690125, + "learning_rate": 4.69575782120077e-05, + "loss": 1.1712, + "step": 10488 + }, + { + "epoch": 3.1837911670966763, + "grad_norm": 1.1557753086090088, + "learning_rate": 4.695251594613749e-05, + "loss": 0.9211, + "step": 10489 + }, + { + "epoch": 3.1840947032933675, + "grad_norm": 0.9488396048545837, + "learning_rate": 4.694745368026729e-05, + "loss": 1.5761, + "step": 10490 + }, + { + "epoch": 3.184398239490059, + "grad_norm": 0.9717015027999878, + "learning_rate": 4.694239141439709e-05, + "loss": 0.3868, + "step": 10491 + }, + { + "epoch": 3.184701775686751, + "grad_norm": 1.0204402208328247, + "learning_rate": 4.693732914852688e-05, + "loss": 0.5984, + "step": 10492 + }, + { + "epoch": 3.185005311883442, + "grad_norm": 0.7851073741912842, + "learning_rate": 4.693226688265668e-05, + "loss": 0.4776, + "step": 10493 + }, + { + "epoch": 3.1853088480801337, + "grad_norm": 0.9176906943321228, + "learning_rate": 4.692720461678647e-05, + "loss": 1.0813, + "step": 10494 + }, + { + "epoch": 3.185612384276825, + "grad_norm": 0.972144603729248, + "learning_rate": 4.692214235091627e-05, + "loss": 1.4463, + "step": 10495 + }, + { + "epoch": 3.1859159204735166, + "grad_norm": 1.193623423576355, + "learning_rate": 4.691708008504607e-05, + "loss": 0.9254, + "step": 10496 + }, + { + "epoch": 3.186219456670208, + "grad_norm": 1.2785239219665527, + "learning_rate": 4.6912017819175864e-05, + "loss": 0.9353, + "step": 10497 + }, + { + "epoch": 3.1865229928668994, + "grad_norm": 1.1432338953018188, + "learning_rate": 4.690695555330566e-05, + "loss": 0.9695, + "step": 10498 + }, + { + "epoch": 3.1868265290635907, + "grad_norm": 0.9062445163726807, + "learning_rate": 4.690189328743546e-05, + "loss": 0.9411, + "step": 10499 + }, + { + "epoch": 3.1871300652602823, + "grad_norm": 0.7954709529876709, + "learning_rate": 4.6896831021565255e-05, + "loss": 0.7768, + "step": 10500 + }, + { + "epoch": 3.1874336014569735, + "grad_norm": 1.1647448539733887, + "learning_rate": 4.6891768755695057e-05, + "loss": 0.823, + "step": 10501 + }, + { + "epoch": 3.187737137653665, + "grad_norm": 0.8608177304267883, + "learning_rate": 4.688670648982485e-05, + "loss": 0.6826, + "step": 10502 + }, + { + "epoch": 3.188040673850357, + "grad_norm": 0.8997310996055603, + "learning_rate": 4.6881644223954646e-05, + "loss": 1.3326, + "step": 10503 + }, + { + "epoch": 3.188344210047048, + "grad_norm": 1.141579270362854, + "learning_rate": 4.687658195808444e-05, + "loss": 0.4784, + "step": 10504 + }, + { + "epoch": 3.1886477462437397, + "grad_norm": 1.0511449575424194, + "learning_rate": 4.6871519692214236e-05, + "loss": 0.8703, + "step": 10505 + }, + { + "epoch": 3.188951282440431, + "grad_norm": 1.0179569721221924, + "learning_rate": 4.686645742634404e-05, + "loss": 1.1936, + "step": 10506 + }, + { + "epoch": 3.1892548186371226, + "grad_norm": 0.9490963220596313, + "learning_rate": 4.686139516047383e-05, + "loss": 0.9454, + "step": 10507 + }, + { + "epoch": 3.189558354833814, + "grad_norm": 0.8751633763313293, + "learning_rate": 4.685633289460363e-05, + "loss": 0.85, + "step": 10508 + }, + { + "epoch": 3.1898618910305054, + "grad_norm": 1.207231879234314, + "learning_rate": 4.685127062873342e-05, + "loss": 0.9717, + "step": 10509 + }, + { + "epoch": 3.1901654272271966, + "grad_norm": 0.9859233498573303, + "learning_rate": 4.6846208362863216e-05, + "loss": 1.0187, + "step": 10510 + }, + { + "epoch": 3.1904689634238883, + "grad_norm": 1.0566620826721191, + "learning_rate": 4.684114609699302e-05, + "loss": 1.2231, + "step": 10511 + }, + { + "epoch": 3.19077249962058, + "grad_norm": 0.9471179246902466, + "learning_rate": 4.683608383112281e-05, + "loss": 0.993, + "step": 10512 + }, + { + "epoch": 3.191076035817271, + "grad_norm": 1.0835156440734863, + "learning_rate": 4.683102156525261e-05, + "loss": 0.9046, + "step": 10513 + }, + { + "epoch": 3.191379572013963, + "grad_norm": 0.8515180349349976, + "learning_rate": 4.68259592993824e-05, + "loss": 1.3276, + "step": 10514 + }, + { + "epoch": 3.191683108210654, + "grad_norm": 1.0197093486785889, + "learning_rate": 4.6820897033512204e-05, + "loss": 1.0541, + "step": 10515 + }, + { + "epoch": 3.1919866444073457, + "grad_norm": 0.9375689029693604, + "learning_rate": 4.6815834767642e-05, + "loss": 0.6162, + "step": 10516 + }, + { + "epoch": 3.192290180604037, + "grad_norm": 0.731779158115387, + "learning_rate": 4.6810772501771794e-05, + "loss": 0.4171, + "step": 10517 + }, + { + "epoch": 3.1925937168007286, + "grad_norm": 1.058833122253418, + "learning_rate": 4.680571023590159e-05, + "loss": 1.2511, + "step": 10518 + }, + { + "epoch": 3.1928972529974198, + "grad_norm": 1.031198501586914, + "learning_rate": 4.680064797003138e-05, + "loss": 0.7109, + "step": 10519 + }, + { + "epoch": 3.1932007891941114, + "grad_norm": 1.0947314500808716, + "learning_rate": 4.6795585704161185e-05, + "loss": 0.5742, + "step": 10520 + }, + { + "epoch": 3.193504325390803, + "grad_norm": 1.1002849340438843, + "learning_rate": 4.6790523438290986e-05, + "loss": 1.0004, + "step": 10521 + }, + { + "epoch": 3.1938078615874943, + "grad_norm": 1.1321697235107422, + "learning_rate": 4.678546117242078e-05, + "loss": 0.6989, + "step": 10522 + }, + { + "epoch": 3.194111397784186, + "grad_norm": 1.0674326419830322, + "learning_rate": 4.6780398906550576e-05, + "loss": 0.6236, + "step": 10523 + }, + { + "epoch": 3.194414933980877, + "grad_norm": 1.0042221546173096, + "learning_rate": 4.677533664068037e-05, + "loss": 1.0788, + "step": 10524 + }, + { + "epoch": 3.194718470177569, + "grad_norm": 0.9673066735267639, + "learning_rate": 4.677027437481017e-05, + "loss": 1.1955, + "step": 10525 + }, + { + "epoch": 3.19502200637426, + "grad_norm": 0.9858225584030151, + "learning_rate": 4.676521210893997e-05, + "loss": 0.9093, + "step": 10526 + }, + { + "epoch": 3.1953255425709517, + "grad_norm": 0.8357099890708923, + "learning_rate": 4.676014984306976e-05, + "loss": 0.7842, + "step": 10527 + }, + { + "epoch": 3.195629078767643, + "grad_norm": 0.8749469518661499, + "learning_rate": 4.675508757719956e-05, + "loss": 0.7905, + "step": 10528 + }, + { + "epoch": 3.1959326149643346, + "grad_norm": 0.9615261554718018, + "learning_rate": 4.675002531132935e-05, + "loss": 1.1927, + "step": 10529 + }, + { + "epoch": 3.1962361511610258, + "grad_norm": 0.7049397826194763, + "learning_rate": 4.674496304545915e-05, + "loss": 1.0481, + "step": 10530 + }, + { + "epoch": 3.1965396873577174, + "grad_norm": 0.924734354019165, + "learning_rate": 4.673990077958895e-05, + "loss": 1.0658, + "step": 10531 + }, + { + "epoch": 3.196843223554409, + "grad_norm": 1.0186694860458374, + "learning_rate": 4.673483851371874e-05, + "loss": 0.649, + "step": 10532 + }, + { + "epoch": 3.1971467597511003, + "grad_norm": 0.6679419279098511, + "learning_rate": 4.672977624784854e-05, + "loss": 1.1072, + "step": 10533 + }, + { + "epoch": 3.197450295947792, + "grad_norm": 1.091610312461853, + "learning_rate": 4.672471398197833e-05, + "loss": 0.9778, + "step": 10534 + }, + { + "epoch": 3.197753832144483, + "grad_norm": 0.7828901410102844, + "learning_rate": 4.6719651716108134e-05, + "loss": 0.6967, + "step": 10535 + }, + { + "epoch": 3.198057368341175, + "grad_norm": 0.9334644675254822, + "learning_rate": 4.671458945023793e-05, + "loss": 0.4858, + "step": 10536 + }, + { + "epoch": 3.198360904537866, + "grad_norm": 0.9903780817985535, + "learning_rate": 4.670952718436772e-05, + "loss": 1.1473, + "step": 10537 + }, + { + "epoch": 3.1986644407345577, + "grad_norm": 0.938765287399292, + "learning_rate": 4.670446491849752e-05, + "loss": 0.7979, + "step": 10538 + }, + { + "epoch": 3.198967976931249, + "grad_norm": 0.8678938150405884, + "learning_rate": 4.669940265262732e-05, + "loss": 0.8317, + "step": 10539 + }, + { + "epoch": 3.1992715131279406, + "grad_norm": 1.1614018678665161, + "learning_rate": 4.6694340386757115e-05, + "loss": 0.6367, + "step": 10540 + }, + { + "epoch": 3.1995750493246318, + "grad_norm": 0.7502949237823486, + "learning_rate": 4.668927812088691e-05, + "loss": 0.8243, + "step": 10541 + }, + { + "epoch": 3.1998785855213234, + "grad_norm": 1.0190019607543945, + "learning_rate": 4.6684215855016704e-05, + "loss": 0.8717, + "step": 10542 + }, + { + "epoch": 3.200182121718015, + "grad_norm": 0.8147711753845215, + "learning_rate": 4.66791535891465e-05, + "loss": 0.2395, + "step": 10543 + }, + { + "epoch": 3.2004856579147063, + "grad_norm": 1.1169768571853638, + "learning_rate": 4.66740913232763e-05, + "loss": 0.6352, + "step": 10544 + }, + { + "epoch": 3.200789194111398, + "grad_norm": 0.7559905648231506, + "learning_rate": 4.66690290574061e-05, + "loss": 0.8913, + "step": 10545 + }, + { + "epoch": 3.201092730308089, + "grad_norm": 0.904870331287384, + "learning_rate": 4.66639667915359e-05, + "loss": 0.8975, + "step": 10546 + }, + { + "epoch": 3.201396266504781, + "grad_norm": 0.8091874122619629, + "learning_rate": 4.665890452566569e-05, + "loss": 1.2952, + "step": 10547 + }, + { + "epoch": 3.201699802701472, + "grad_norm": 1.0972801446914673, + "learning_rate": 4.6653842259795486e-05, + "loss": 0.7569, + "step": 10548 + }, + { + "epoch": 3.2020033388981637, + "grad_norm": 1.1147466897964478, + "learning_rate": 4.664877999392528e-05, + "loss": 0.9802, + "step": 10549 + }, + { + "epoch": 3.202306875094855, + "grad_norm": 1.1943230628967285, + "learning_rate": 4.664371772805508e-05, + "loss": 0.7045, + "step": 10550 + }, + { + "epoch": 3.2026104112915466, + "grad_norm": 1.2622238397598267, + "learning_rate": 4.663865546218488e-05, + "loss": 0.8173, + "step": 10551 + }, + { + "epoch": 3.2029139474882378, + "grad_norm": 1.2396291494369507, + "learning_rate": 4.663359319631467e-05, + "loss": 0.68, + "step": 10552 + }, + { + "epoch": 3.2032174836849294, + "grad_norm": 1.1432640552520752, + "learning_rate": 4.662853093044447e-05, + "loss": 0.6573, + "step": 10553 + }, + { + "epoch": 3.203521019881621, + "grad_norm": 1.17825186252594, + "learning_rate": 4.662346866457427e-05, + "loss": 0.9158, + "step": 10554 + }, + { + "epoch": 3.2038245560783123, + "grad_norm": 1.0937422513961792, + "learning_rate": 4.6618406398704064e-05, + "loss": 0.9735, + "step": 10555 + }, + { + "epoch": 3.204128092275004, + "grad_norm": 1.0007274150848389, + "learning_rate": 4.661334413283386e-05, + "loss": 0.704, + "step": 10556 + }, + { + "epoch": 3.204431628471695, + "grad_norm": 1.0235059261322021, + "learning_rate": 4.660828186696365e-05, + "loss": 0.7544, + "step": 10557 + }, + { + "epoch": 3.204735164668387, + "grad_norm": 1.0718505382537842, + "learning_rate": 4.660321960109345e-05, + "loss": 0.5919, + "step": 10558 + }, + { + "epoch": 3.205038700865078, + "grad_norm": 0.90024733543396, + "learning_rate": 4.659815733522325e-05, + "loss": 0.7822, + "step": 10559 + }, + { + "epoch": 3.2053422370617697, + "grad_norm": 1.0417277812957764, + "learning_rate": 4.6593095069353044e-05, + "loss": 0.7518, + "step": 10560 + }, + { + "epoch": 3.205645773258461, + "grad_norm": 1.0350341796875, + "learning_rate": 4.658803280348284e-05, + "loss": 0.9891, + "step": 10561 + }, + { + "epoch": 3.2059493094551526, + "grad_norm": 0.9386329054832458, + "learning_rate": 4.6582970537612634e-05, + "loss": 1.0598, + "step": 10562 + }, + { + "epoch": 3.2062528456518438, + "grad_norm": 0.8749595284461975, + "learning_rate": 4.657790827174243e-05, + "loss": 1.0382, + "step": 10563 + }, + { + "epoch": 3.2065563818485354, + "grad_norm": 1.1091443300247192, + "learning_rate": 4.657284600587223e-05, + "loss": 0.6033, + "step": 10564 + }, + { + "epoch": 3.206859918045227, + "grad_norm": 1.025846004486084, + "learning_rate": 4.6567783740002025e-05, + "loss": 0.8092, + "step": 10565 + }, + { + "epoch": 3.2071634542419183, + "grad_norm": 1.1258829832077026, + "learning_rate": 4.656272147413183e-05, + "loss": 0.7107, + "step": 10566 + }, + { + "epoch": 3.20746699043861, + "grad_norm": 0.9417251944541931, + "learning_rate": 4.655765920826162e-05, + "loss": 0.7569, + "step": 10567 + }, + { + "epoch": 3.207770526635301, + "grad_norm": 1.0193554162979126, + "learning_rate": 4.6552596942391416e-05, + "loss": 0.5618, + "step": 10568 + }, + { + "epoch": 3.208074062831993, + "grad_norm": 1.0344990491867065, + "learning_rate": 4.654753467652122e-05, + "loss": 0.9427, + "step": 10569 + }, + { + "epoch": 3.208377599028684, + "grad_norm": 1.038217306137085, + "learning_rate": 4.654247241065101e-05, + "loss": 0.8161, + "step": 10570 + }, + { + "epoch": 3.2086811352253757, + "grad_norm": 1.097551941871643, + "learning_rate": 4.653741014478081e-05, + "loss": 0.8841, + "step": 10571 + }, + { + "epoch": 3.208984671422067, + "grad_norm": 0.9986087679862976, + "learning_rate": 4.65323478789106e-05, + "loss": 0.9999, + "step": 10572 + }, + { + "epoch": 3.2092882076187585, + "grad_norm": 0.9904304146766663, + "learning_rate": 4.65272856130404e-05, + "loss": 1.2553, + "step": 10573 + }, + { + "epoch": 3.2095917438154498, + "grad_norm": 0.8540041446685791, + "learning_rate": 4.65222233471702e-05, + "loss": 0.7787, + "step": 10574 + }, + { + "epoch": 3.2098952800121414, + "grad_norm": 0.9092767834663391, + "learning_rate": 4.651716108129999e-05, + "loss": 0.6381, + "step": 10575 + }, + { + "epoch": 3.210198816208833, + "grad_norm": 1.2394921779632568, + "learning_rate": 4.651209881542979e-05, + "loss": 0.6842, + "step": 10576 + }, + { + "epoch": 3.2105023524055243, + "grad_norm": 1.1485285758972168, + "learning_rate": 4.650703654955958e-05, + "loss": 0.6583, + "step": 10577 + }, + { + "epoch": 3.210805888602216, + "grad_norm": 1.1151689291000366, + "learning_rate": 4.6501974283689384e-05, + "loss": 0.7781, + "step": 10578 + }, + { + "epoch": 3.211109424798907, + "grad_norm": 1.0521682500839233, + "learning_rate": 4.649691201781918e-05, + "loss": 0.7147, + "step": 10579 + }, + { + "epoch": 3.211412960995599, + "grad_norm": 1.031783938407898, + "learning_rate": 4.6491849751948974e-05, + "loss": 1.0621, + "step": 10580 + }, + { + "epoch": 3.21171649719229, + "grad_norm": 1.155425786972046, + "learning_rate": 4.648678748607877e-05, + "loss": 0.964, + "step": 10581 + }, + { + "epoch": 3.2120200333889817, + "grad_norm": 1.1283220052719116, + "learning_rate": 4.6481725220208564e-05, + "loss": 0.4224, + "step": 10582 + }, + { + "epoch": 3.212323569585673, + "grad_norm": 0.9688919186592102, + "learning_rate": 4.6476662954338365e-05, + "loss": 0.8884, + "step": 10583 + }, + { + "epoch": 3.2126271057823645, + "grad_norm": 1.097516417503357, + "learning_rate": 4.647160068846816e-05, + "loss": 0.8736, + "step": 10584 + }, + { + "epoch": 3.212930641979056, + "grad_norm": 1.0238796472549438, + "learning_rate": 4.6466538422597955e-05, + "loss": 0.8874, + "step": 10585 + }, + { + "epoch": 3.2132341781757474, + "grad_norm": 1.156239628791809, + "learning_rate": 4.646147615672775e-05, + "loss": 0.9518, + "step": 10586 + }, + { + "epoch": 3.213537714372439, + "grad_norm": 1.1334460973739624, + "learning_rate": 4.6456413890857544e-05, + "loss": 1.1138, + "step": 10587 + }, + { + "epoch": 3.2138412505691303, + "grad_norm": 1.1904267072677612, + "learning_rate": 4.6451351624987346e-05, + "loss": 0.6535, + "step": 10588 + }, + { + "epoch": 3.214144786765822, + "grad_norm": 1.2623374462127686, + "learning_rate": 4.644628935911715e-05, + "loss": 0.7739, + "step": 10589 + }, + { + "epoch": 3.214448322962513, + "grad_norm": 1.237666368484497, + "learning_rate": 4.644122709324694e-05, + "loss": 1.1083, + "step": 10590 + }, + { + "epoch": 3.214751859159205, + "grad_norm": 0.9218225479125977, + "learning_rate": 4.643616482737674e-05, + "loss": 0.7477, + "step": 10591 + }, + { + "epoch": 3.215055395355896, + "grad_norm": 0.8881485462188721, + "learning_rate": 4.643110256150653e-05, + "loss": 0.679, + "step": 10592 + }, + { + "epoch": 3.2153589315525877, + "grad_norm": 1.1710610389709473, + "learning_rate": 4.6426040295636334e-05, + "loss": 0.6418, + "step": 10593 + }, + { + "epoch": 3.2156624677492793, + "grad_norm": 1.102946162223816, + "learning_rate": 4.642097802976613e-05, + "loss": 0.9151, + "step": 10594 + }, + { + "epoch": 3.2159660039459705, + "grad_norm": 0.8424364924430847, + "learning_rate": 4.641591576389592e-05, + "loss": 0.7412, + "step": 10595 + }, + { + "epoch": 3.216269540142662, + "grad_norm": 1.0896893739700317, + "learning_rate": 4.641085349802572e-05, + "loss": 0.6239, + "step": 10596 + }, + { + "epoch": 3.2165730763393534, + "grad_norm": 0.9789384603500366, + "learning_rate": 4.640579123215551e-05, + "loss": 0.7869, + "step": 10597 + }, + { + "epoch": 3.216876612536045, + "grad_norm": 1.0091615915298462, + "learning_rate": 4.6400728966285314e-05, + "loss": 1.2307, + "step": 10598 + }, + { + "epoch": 3.2171801487327363, + "grad_norm": 1.25413179397583, + "learning_rate": 4.639566670041511e-05, + "loss": 0.9175, + "step": 10599 + }, + { + "epoch": 3.217483684929428, + "grad_norm": 0.9582463502883911, + "learning_rate": 4.6390604434544904e-05, + "loss": 1.1541, + "step": 10600 + }, + { + "epoch": 3.217787221126119, + "grad_norm": 1.05919349193573, + "learning_rate": 4.63855421686747e-05, + "loss": 0.4802, + "step": 10601 + }, + { + "epoch": 3.218090757322811, + "grad_norm": 0.8946004509925842, + "learning_rate": 4.6380479902804493e-05, + "loss": 1.4509, + "step": 10602 + }, + { + "epoch": 3.218394293519502, + "grad_norm": 1.2026264667510986, + "learning_rate": 4.6375417636934295e-05, + "loss": 0.7478, + "step": 10603 + }, + { + "epoch": 3.2186978297161937, + "grad_norm": 1.112428069114685, + "learning_rate": 4.637035537106409e-05, + "loss": 0.9188, + "step": 10604 + }, + { + "epoch": 3.2190013659128853, + "grad_norm": 1.2455602884292603, + "learning_rate": 4.6365293105193885e-05, + "loss": 0.8507, + "step": 10605 + }, + { + "epoch": 3.2193049021095765, + "grad_norm": 1.1607472896575928, + "learning_rate": 4.636023083932368e-05, + "loss": 1.0462, + "step": 10606 + }, + { + "epoch": 3.219608438306268, + "grad_norm": 1.044325828552246, + "learning_rate": 4.635516857345348e-05, + "loss": 0.9742, + "step": 10607 + }, + { + "epoch": 3.2199119745029594, + "grad_norm": 0.880424439907074, + "learning_rate": 4.6350106307583276e-05, + "loss": 1.1564, + "step": 10608 + }, + { + "epoch": 3.220215510699651, + "grad_norm": 1.0218632221221924, + "learning_rate": 4.634504404171307e-05, + "loss": 0.6782, + "step": 10609 + }, + { + "epoch": 3.2205190468963423, + "grad_norm": 1.1731562614440918, + "learning_rate": 4.633998177584287e-05, + "loss": 0.8932, + "step": 10610 + }, + { + "epoch": 3.220822583093034, + "grad_norm": 1.1994465589523315, + "learning_rate": 4.633491950997267e-05, + "loss": 1.1647, + "step": 10611 + }, + { + "epoch": 3.221126119289725, + "grad_norm": 1.2340362071990967, + "learning_rate": 4.632985724410246e-05, + "loss": 0.7743, + "step": 10612 + }, + { + "epoch": 3.221429655486417, + "grad_norm": 0.989773690700531, + "learning_rate": 4.632479497823226e-05, + "loss": 1.3536, + "step": 10613 + }, + { + "epoch": 3.221733191683108, + "grad_norm": 0.941053032875061, + "learning_rate": 4.631973271236206e-05, + "loss": 0.8544, + "step": 10614 + }, + { + "epoch": 3.2220367278797997, + "grad_norm": 1.07404625415802, + "learning_rate": 4.631467044649185e-05, + "loss": 0.847, + "step": 10615 + }, + { + "epoch": 3.2223402640764913, + "grad_norm": 1.12881600856781, + "learning_rate": 4.630960818062165e-05, + "loss": 1.0886, + "step": 10616 + }, + { + "epoch": 3.2226438002731825, + "grad_norm": 1.0794014930725098, + "learning_rate": 4.630454591475145e-05, + "loss": 1.0087, + "step": 10617 + }, + { + "epoch": 3.222947336469874, + "grad_norm": 1.13034987449646, + "learning_rate": 4.6299483648881244e-05, + "loss": 0.7996, + "step": 10618 + }, + { + "epoch": 3.2232508726665654, + "grad_norm": 1.0205479860305786, + "learning_rate": 4.629442138301104e-05, + "loss": 1.4104, + "step": 10619 + }, + { + "epoch": 3.223554408863257, + "grad_norm": 0.9251573085784912, + "learning_rate": 4.6289359117140834e-05, + "loss": 0.9119, + "step": 10620 + }, + { + "epoch": 3.2238579450599483, + "grad_norm": 1.0073034763336182, + "learning_rate": 4.628429685127063e-05, + "loss": 0.8946, + "step": 10621 + }, + { + "epoch": 3.22416148125664, + "grad_norm": 1.1060259342193604, + "learning_rate": 4.627923458540043e-05, + "loss": 0.7775, + "step": 10622 + }, + { + "epoch": 3.224465017453331, + "grad_norm": 1.1062958240509033, + "learning_rate": 4.6274172319530225e-05, + "loss": 1.0847, + "step": 10623 + }, + { + "epoch": 3.224768553650023, + "grad_norm": 1.1014045476913452, + "learning_rate": 4.626911005366002e-05, + "loss": 1.0606, + "step": 10624 + }, + { + "epoch": 3.225072089846714, + "grad_norm": 1.0348385572433472, + "learning_rate": 4.6264047787789814e-05, + "loss": 1.0541, + "step": 10625 + }, + { + "epoch": 3.2253756260434057, + "grad_norm": 0.9178497791290283, + "learning_rate": 4.625898552191961e-05, + "loss": 0.935, + "step": 10626 + }, + { + "epoch": 3.2256791622400973, + "grad_norm": 0.9519029259681702, + "learning_rate": 4.625392325604941e-05, + "loss": 0.5853, + "step": 10627 + }, + { + "epoch": 3.2259826984367885, + "grad_norm": 0.8825869560241699, + "learning_rate": 4.6248860990179206e-05, + "loss": 0.3681, + "step": 10628 + }, + { + "epoch": 3.22628623463348, + "grad_norm": 1.056599736213684, + "learning_rate": 4.6243798724309e-05, + "loss": 0.4295, + "step": 10629 + }, + { + "epoch": 3.2265897708301714, + "grad_norm": 1.0902197360992432, + "learning_rate": 4.6238736458438795e-05, + "loss": 1.042, + "step": 10630 + }, + { + "epoch": 3.226893307026863, + "grad_norm": 0.8041782379150391, + "learning_rate": 4.62336741925686e-05, + "loss": 1.1872, + "step": 10631 + }, + { + "epoch": 3.2271968432235543, + "grad_norm": 0.9269036650657654, + "learning_rate": 4.622861192669839e-05, + "loss": 0.9548, + "step": 10632 + }, + { + "epoch": 3.227500379420246, + "grad_norm": 1.074905514717102, + "learning_rate": 4.622354966082819e-05, + "loss": 0.9435, + "step": 10633 + }, + { + "epoch": 3.227803915616937, + "grad_norm": 1.096496820449829, + "learning_rate": 4.621848739495799e-05, + "loss": 1.0128, + "step": 10634 + }, + { + "epoch": 3.228107451813629, + "grad_norm": 0.8798861503601074, + "learning_rate": 4.621342512908778e-05, + "loss": 1.2336, + "step": 10635 + }, + { + "epoch": 3.22841098801032, + "grad_norm": 1.3412737846374512, + "learning_rate": 4.620836286321758e-05, + "loss": 0.7272, + "step": 10636 + }, + { + "epoch": 3.2287145242070117, + "grad_norm": 1.2291632890701294, + "learning_rate": 4.620330059734738e-05, + "loss": 0.9837, + "step": 10637 + }, + { + "epoch": 3.2290180604037033, + "grad_norm": 1.0122969150543213, + "learning_rate": 4.6198238331477174e-05, + "loss": 1.0514, + "step": 10638 + }, + { + "epoch": 3.2293215966003945, + "grad_norm": 0.8240249156951904, + "learning_rate": 4.619317606560697e-05, + "loss": 0.6986, + "step": 10639 + }, + { + "epoch": 3.229625132797086, + "grad_norm": 1.0882290601730347, + "learning_rate": 4.6188113799736763e-05, + "loss": 1.0624, + "step": 10640 + }, + { + "epoch": 3.2299286689937774, + "grad_norm": 0.9558147192001343, + "learning_rate": 4.618305153386656e-05, + "loss": 0.7102, + "step": 10641 + }, + { + "epoch": 3.230232205190469, + "grad_norm": 0.9820065498352051, + "learning_rate": 4.617798926799636e-05, + "loss": 0.6857, + "step": 10642 + }, + { + "epoch": 3.2305357413871603, + "grad_norm": 1.1940356492996216, + "learning_rate": 4.6172927002126155e-05, + "loss": 0.9741, + "step": 10643 + }, + { + "epoch": 3.230839277583852, + "grad_norm": 1.141107201576233, + "learning_rate": 4.616786473625595e-05, + "loss": 0.8379, + "step": 10644 + }, + { + "epoch": 3.231142813780543, + "grad_norm": 0.9142043590545654, + "learning_rate": 4.6162802470385744e-05, + "loss": 0.8696, + "step": 10645 + }, + { + "epoch": 3.231446349977235, + "grad_norm": 1.134381890296936, + "learning_rate": 4.6157740204515546e-05, + "loss": 0.771, + "step": 10646 + }, + { + "epoch": 3.2317498861739264, + "grad_norm": 1.0686992406845093, + "learning_rate": 4.615267793864534e-05, + "loss": 0.5888, + "step": 10647 + }, + { + "epoch": 3.2320534223706177, + "grad_norm": 0.9754731059074402, + "learning_rate": 4.6147615672775135e-05, + "loss": 1.0414, + "step": 10648 + }, + { + "epoch": 3.2323569585673093, + "grad_norm": 0.9926722049713135, + "learning_rate": 4.614255340690493e-05, + "loss": 1.0837, + "step": 10649 + }, + { + "epoch": 3.2326604947640005, + "grad_norm": 0.9956318140029907, + "learning_rate": 4.6137491141034725e-05, + "loss": 0.568, + "step": 10650 + }, + { + "epoch": 3.232964030960692, + "grad_norm": 0.8912085890769958, + "learning_rate": 4.6132428875164526e-05, + "loss": 1.0641, + "step": 10651 + }, + { + "epoch": 3.2332675671573834, + "grad_norm": 0.8449482917785645, + "learning_rate": 4.612736660929432e-05, + "loss": 0.8797, + "step": 10652 + }, + { + "epoch": 3.233571103354075, + "grad_norm": 0.7308463454246521, + "learning_rate": 4.6122304343424116e-05, + "loss": 0.5446, + "step": 10653 + }, + { + "epoch": 3.2338746395507663, + "grad_norm": 0.9520199298858643, + "learning_rate": 4.611724207755391e-05, + "loss": 1.3735, + "step": 10654 + }, + { + "epoch": 3.234178175747458, + "grad_norm": 0.8654726147651672, + "learning_rate": 4.611217981168371e-05, + "loss": 0.9184, + "step": 10655 + }, + { + "epoch": 3.2344817119441496, + "grad_norm": 1.247514009475708, + "learning_rate": 4.6107117545813514e-05, + "loss": 0.9201, + "step": 10656 + }, + { + "epoch": 3.234785248140841, + "grad_norm": 0.9561178684234619, + "learning_rate": 4.610205527994331e-05, + "loss": 1.2475, + "step": 10657 + }, + { + "epoch": 3.2350887843375324, + "grad_norm": 1.004738450050354, + "learning_rate": 4.6096993014073104e-05, + "loss": 0.9635, + "step": 10658 + }, + { + "epoch": 3.2353923205342237, + "grad_norm": 1.1536376476287842, + "learning_rate": 4.60919307482029e-05, + "loss": 0.8195, + "step": 10659 + }, + { + "epoch": 3.2356958567309153, + "grad_norm": 0.7363036870956421, + "learning_rate": 4.608686848233269e-05, + "loss": 1.0029, + "step": 10660 + }, + { + "epoch": 3.2359993929276065, + "grad_norm": 0.7070170044898987, + "learning_rate": 4.6081806216462495e-05, + "loss": 0.6415, + "step": 10661 + }, + { + "epoch": 3.236302929124298, + "grad_norm": 0.8688763976097107, + "learning_rate": 4.607674395059229e-05, + "loss": 0.8557, + "step": 10662 + }, + { + "epoch": 3.2366064653209894, + "grad_norm": 1.031697154045105, + "learning_rate": 4.6071681684722084e-05, + "loss": 1.1908, + "step": 10663 + }, + { + "epoch": 3.236910001517681, + "grad_norm": 1.0898487567901611, + "learning_rate": 4.606661941885188e-05, + "loss": 0.7292, + "step": 10664 + }, + { + "epoch": 3.2372135377143723, + "grad_norm": 1.0735975503921509, + "learning_rate": 4.6061557152981674e-05, + "loss": 0.8378, + "step": 10665 + }, + { + "epoch": 3.237517073911064, + "grad_norm": 1.2165467739105225, + "learning_rate": 4.6056494887111475e-05, + "loss": 0.8289, + "step": 10666 + }, + { + "epoch": 3.2378206101077556, + "grad_norm": 1.191837191581726, + "learning_rate": 4.605143262124127e-05, + "loss": 0.707, + "step": 10667 + }, + { + "epoch": 3.238124146304447, + "grad_norm": 1.1953022480010986, + "learning_rate": 4.6046370355371065e-05, + "loss": 0.6559, + "step": 10668 + }, + { + "epoch": 3.2384276825011384, + "grad_norm": 1.1249115467071533, + "learning_rate": 4.604130808950086e-05, + "loss": 0.7527, + "step": 10669 + }, + { + "epoch": 3.2387312186978297, + "grad_norm": 0.9275970458984375, + "learning_rate": 4.603624582363066e-05, + "loss": 0.5327, + "step": 10670 + }, + { + "epoch": 3.2390347548945213, + "grad_norm": 0.912882387638092, + "learning_rate": 4.6031183557760456e-05, + "loss": 1.2058, + "step": 10671 + }, + { + "epoch": 3.2393382910912125, + "grad_norm": 1.0001786947250366, + "learning_rate": 4.602612129189025e-05, + "loss": 1.1827, + "step": 10672 + }, + { + "epoch": 3.239641827287904, + "grad_norm": 1.056784749031067, + "learning_rate": 4.6021059026020046e-05, + "loss": 0.8834, + "step": 10673 + }, + { + "epoch": 3.2399453634845954, + "grad_norm": 1.0593361854553223, + "learning_rate": 4.601599676014984e-05, + "loss": 0.8741, + "step": 10674 + }, + { + "epoch": 3.240248899681287, + "grad_norm": 1.2201656103134155, + "learning_rate": 4.601093449427964e-05, + "loss": 0.7956, + "step": 10675 + }, + { + "epoch": 3.2405524358779783, + "grad_norm": 0.9234567284584045, + "learning_rate": 4.600587222840944e-05, + "loss": 0.8796, + "step": 10676 + }, + { + "epoch": 3.24085597207467, + "grad_norm": 1.0139257907867432, + "learning_rate": 4.600080996253924e-05, + "loss": 1.4306, + "step": 10677 + }, + { + "epoch": 3.2411595082713616, + "grad_norm": 1.093319296836853, + "learning_rate": 4.599574769666903e-05, + "loss": 0.945, + "step": 10678 + }, + { + "epoch": 3.241463044468053, + "grad_norm": 1.1480295658111572, + "learning_rate": 4.599068543079883e-05, + "loss": 0.7341, + "step": 10679 + }, + { + "epoch": 3.2417665806647444, + "grad_norm": 1.101312279701233, + "learning_rate": 4.598562316492862e-05, + "loss": 0.7838, + "step": 10680 + }, + { + "epoch": 3.2420701168614356, + "grad_norm": 0.9698290824890137, + "learning_rate": 4.5980560899058425e-05, + "loss": 0.9298, + "step": 10681 + }, + { + "epoch": 3.2423736530581273, + "grad_norm": 1.0011426210403442, + "learning_rate": 4.597549863318822e-05, + "loss": 0.4071, + "step": 10682 + }, + { + "epoch": 3.2426771892548185, + "grad_norm": 0.8187726140022278, + "learning_rate": 4.5970436367318014e-05, + "loss": 1.1892, + "step": 10683 + }, + { + "epoch": 3.24298072545151, + "grad_norm": 0.977440595626831, + "learning_rate": 4.596537410144781e-05, + "loss": 1.2906, + "step": 10684 + }, + { + "epoch": 3.2432842616482014, + "grad_norm": 1.0147929191589355, + "learning_rate": 4.596031183557761e-05, + "loss": 0.7205, + "step": 10685 + }, + { + "epoch": 3.243587797844893, + "grad_norm": 0.736835241317749, + "learning_rate": 4.5955249569707405e-05, + "loss": 0.3608, + "step": 10686 + }, + { + "epoch": 3.2438913340415843, + "grad_norm": 0.865317165851593, + "learning_rate": 4.59501873038372e-05, + "loss": 1.3695, + "step": 10687 + }, + { + "epoch": 3.244194870238276, + "grad_norm": 0.882211446762085, + "learning_rate": 4.5945125037966995e-05, + "loss": 0.766, + "step": 10688 + }, + { + "epoch": 3.2444984064349676, + "grad_norm": 0.8422814607620239, + "learning_rate": 4.594006277209679e-05, + "loss": 1.4423, + "step": 10689 + }, + { + "epoch": 3.2448019426316588, + "grad_norm": 0.9718270897865295, + "learning_rate": 4.593500050622659e-05, + "loss": 0.7361, + "step": 10690 + }, + { + "epoch": 3.2451054788283504, + "grad_norm": 1.0493378639221191, + "learning_rate": 4.5929938240356386e-05, + "loss": 0.7964, + "step": 10691 + }, + { + "epoch": 3.2454090150250416, + "grad_norm": 1.2916702032089233, + "learning_rate": 4.592487597448618e-05, + "loss": 0.7989, + "step": 10692 + }, + { + "epoch": 3.2457125512217333, + "grad_norm": 0.8385698199272156, + "learning_rate": 4.5919813708615976e-05, + "loss": 0.9955, + "step": 10693 + }, + { + "epoch": 3.2460160874184245, + "grad_norm": 1.075981616973877, + "learning_rate": 4.591475144274577e-05, + "loss": 0.5251, + "step": 10694 + }, + { + "epoch": 3.246319623615116, + "grad_norm": 0.8651655316352844, + "learning_rate": 4.590968917687557e-05, + "loss": 1.0742, + "step": 10695 + }, + { + "epoch": 3.2466231598118074, + "grad_norm": 0.8534205555915833, + "learning_rate": 4.590462691100537e-05, + "loss": 0.463, + "step": 10696 + }, + { + "epoch": 3.246926696008499, + "grad_norm": 0.9287769198417664, + "learning_rate": 4.589956464513516e-05, + "loss": 0.6282, + "step": 10697 + }, + { + "epoch": 3.2472302322051902, + "grad_norm": 1.0819957256317139, + "learning_rate": 4.5894502379264956e-05, + "loss": 1.1323, + "step": 10698 + }, + { + "epoch": 3.247533768401882, + "grad_norm": 1.050036907196045, + "learning_rate": 4.588944011339476e-05, + "loss": 0.6086, + "step": 10699 + }, + { + "epoch": 3.2478373045985736, + "grad_norm": 0.9179292917251587, + "learning_rate": 4.588437784752456e-05, + "loss": 0.7968, + "step": 10700 + }, + { + "epoch": 3.2481408407952648, + "grad_norm": 0.6632173657417297, + "learning_rate": 4.5879315581654354e-05, + "loss": 0.8743, + "step": 10701 + }, + { + "epoch": 3.2484443769919564, + "grad_norm": 0.972480833530426, + "learning_rate": 4.587425331578415e-05, + "loss": 0.7915, + "step": 10702 + }, + { + "epoch": 3.2487479131886476, + "grad_norm": 1.1181424856185913, + "learning_rate": 4.5869191049913944e-05, + "loss": 0.802, + "step": 10703 + }, + { + "epoch": 3.2490514493853393, + "grad_norm": 0.783929705619812, + "learning_rate": 4.586412878404374e-05, + "loss": 1.2416, + "step": 10704 + }, + { + "epoch": 3.2493549855820305, + "grad_norm": 0.8930378556251526, + "learning_rate": 4.585906651817354e-05, + "loss": 0.7902, + "step": 10705 + }, + { + "epoch": 3.249658521778722, + "grad_norm": 0.9828311800956726, + "learning_rate": 4.5854004252303335e-05, + "loss": 1.2604, + "step": 10706 + }, + { + "epoch": 3.2499620579754134, + "grad_norm": 0.9092838764190674, + "learning_rate": 4.584894198643313e-05, + "loss": 1.292, + "step": 10707 + }, + { + "epoch": 3.250265594172105, + "grad_norm": 0.9778360724449158, + "learning_rate": 4.5843879720562925e-05, + "loss": 1.1521, + "step": 10708 + }, + { + "epoch": 3.2505691303687962, + "grad_norm": 0.9975069165229797, + "learning_rate": 4.5838817454692726e-05, + "loss": 0.7547, + "step": 10709 + }, + { + "epoch": 3.250872666565488, + "grad_norm": 1.1369693279266357, + "learning_rate": 4.583375518882252e-05, + "loss": 0.7372, + "step": 10710 + }, + { + "epoch": 3.2511762027621796, + "grad_norm": 1.1483356952667236, + "learning_rate": 4.5828692922952316e-05, + "loss": 1.1202, + "step": 10711 + }, + { + "epoch": 3.2514797389588708, + "grad_norm": 1.0134035348892212, + "learning_rate": 4.582363065708211e-05, + "loss": 0.5456, + "step": 10712 + }, + { + "epoch": 3.2517832751555624, + "grad_norm": 0.8458219170570374, + "learning_rate": 4.5818568391211905e-05, + "loss": 1.2801, + "step": 10713 + }, + { + "epoch": 3.2520868113522536, + "grad_norm": 1.0307174921035767, + "learning_rate": 4.581350612534171e-05, + "loss": 0.983, + "step": 10714 + }, + { + "epoch": 3.2523903475489453, + "grad_norm": 0.9404361248016357, + "learning_rate": 4.58084438594715e-05, + "loss": 1.1978, + "step": 10715 + }, + { + "epoch": 3.2526938837456365, + "grad_norm": 0.9472918510437012, + "learning_rate": 4.5803381593601297e-05, + "loss": 0.9409, + "step": 10716 + }, + { + "epoch": 3.252997419942328, + "grad_norm": 0.761706531047821, + "learning_rate": 4.579831932773109e-05, + "loss": 0.5885, + "step": 10717 + }, + { + "epoch": 3.25330095613902, + "grad_norm": 1.1660070419311523, + "learning_rate": 4.5793257061860886e-05, + "loss": 0.6708, + "step": 10718 + }, + { + "epoch": 3.253604492335711, + "grad_norm": 1.000372052192688, + "learning_rate": 4.578819479599069e-05, + "loss": 0.7903, + "step": 10719 + }, + { + "epoch": 3.2539080285324022, + "grad_norm": 1.0931434631347656, + "learning_rate": 4.578313253012048e-05, + "loss": 1.1992, + "step": 10720 + }, + { + "epoch": 3.254211564729094, + "grad_norm": 1.1989152431488037, + "learning_rate": 4.577807026425028e-05, + "loss": 0.6295, + "step": 10721 + }, + { + "epoch": 3.2545151009257856, + "grad_norm": 1.0379623174667358, + "learning_rate": 4.577300799838008e-05, + "loss": 1.1497, + "step": 10722 + }, + { + "epoch": 3.2548186371224768, + "grad_norm": 0.9696967601776123, + "learning_rate": 4.5767945732509874e-05, + "loss": 1.41, + "step": 10723 + }, + { + "epoch": 3.2551221733191684, + "grad_norm": 1.1930303573608398, + "learning_rate": 4.5762883466639675e-05, + "loss": 0.7729, + "step": 10724 + }, + { + "epoch": 3.2554257095158596, + "grad_norm": 1.0501840114593506, + "learning_rate": 4.575782120076947e-05, + "loss": 1.0502, + "step": 10725 + }, + { + "epoch": 3.2557292457125513, + "grad_norm": 1.0565099716186523, + "learning_rate": 4.5752758934899265e-05, + "loss": 0.9146, + "step": 10726 + }, + { + "epoch": 3.2560327819092425, + "grad_norm": 0.7854439616203308, + "learning_rate": 4.574769666902906e-05, + "loss": 1.5723, + "step": 10727 + }, + { + "epoch": 3.256336318105934, + "grad_norm": 0.9324937462806702, + "learning_rate": 4.5742634403158854e-05, + "loss": 0.7123, + "step": 10728 + }, + { + "epoch": 3.256639854302626, + "grad_norm": 1.0135712623596191, + "learning_rate": 4.5737572137288656e-05, + "loss": 0.9938, + "step": 10729 + }, + { + "epoch": 3.256943390499317, + "grad_norm": 0.934712827205658, + "learning_rate": 4.573250987141845e-05, + "loss": 1.1236, + "step": 10730 + }, + { + "epoch": 3.2572469266960087, + "grad_norm": 1.1099516153335571, + "learning_rate": 4.5727447605548246e-05, + "loss": 0.9718, + "step": 10731 + }, + { + "epoch": 3.2575504628927, + "grad_norm": 0.9483317136764526, + "learning_rate": 4.572238533967804e-05, + "loss": 0.9117, + "step": 10732 + }, + { + "epoch": 3.2578539990893916, + "grad_norm": 1.1352256536483765, + "learning_rate": 4.5717323073807835e-05, + "loss": 0.7473, + "step": 10733 + }, + { + "epoch": 3.2581575352860828, + "grad_norm": 0.8792520761489868, + "learning_rate": 4.571226080793764e-05, + "loss": 0.8135, + "step": 10734 + }, + { + "epoch": 3.2584610714827744, + "grad_norm": 1.1803364753723145, + "learning_rate": 4.570719854206743e-05, + "loss": 0.5483, + "step": 10735 + }, + { + "epoch": 3.2587646076794656, + "grad_norm": 1.0876795053482056, + "learning_rate": 4.5702136276197226e-05, + "loss": 0.9788, + "step": 10736 + }, + { + "epoch": 3.2590681438761573, + "grad_norm": 1.0496044158935547, + "learning_rate": 4.569707401032702e-05, + "loss": 1.0614, + "step": 10737 + }, + { + "epoch": 3.2593716800728485, + "grad_norm": 1.1093744039535522, + "learning_rate": 4.569201174445682e-05, + "loss": 1.0443, + "step": 10738 + }, + { + "epoch": 3.25967521626954, + "grad_norm": 1.0227601528167725, + "learning_rate": 4.568694947858662e-05, + "loss": 1.1106, + "step": 10739 + }, + { + "epoch": 3.259978752466232, + "grad_norm": 1.3634077310562134, + "learning_rate": 4.568188721271641e-05, + "loss": 0.7022, + "step": 10740 + }, + { + "epoch": 3.260282288662923, + "grad_norm": 1.1404317617416382, + "learning_rate": 4.567682494684621e-05, + "loss": 0.8785, + "step": 10741 + }, + { + "epoch": 3.2605858248596147, + "grad_norm": 0.863370954990387, + "learning_rate": 4.5671762680976e-05, + "loss": 0.6955, + "step": 10742 + }, + { + "epoch": 3.260889361056306, + "grad_norm": 1.0121564865112305, + "learning_rate": 4.5666700415105803e-05, + "loss": 1.3315, + "step": 10743 + }, + { + "epoch": 3.2611928972529975, + "grad_norm": 1.1657657623291016, + "learning_rate": 4.5661638149235605e-05, + "loss": 0.7443, + "step": 10744 + }, + { + "epoch": 3.2614964334496888, + "grad_norm": 1.1776002645492554, + "learning_rate": 4.56565758833654e-05, + "loss": 1.002, + "step": 10745 + }, + { + "epoch": 3.2617999696463804, + "grad_norm": 1.0279992818832397, + "learning_rate": 4.5651513617495195e-05, + "loss": 0.5771, + "step": 10746 + }, + { + "epoch": 3.2621035058430716, + "grad_norm": 1.0485210418701172, + "learning_rate": 4.564645135162499e-05, + "loss": 1.0635, + "step": 10747 + }, + { + "epoch": 3.2624070420397633, + "grad_norm": 1.22089421749115, + "learning_rate": 4.564138908575479e-05, + "loss": 1.0436, + "step": 10748 + }, + { + "epoch": 3.2627105782364545, + "grad_norm": 1.2425166368484497, + "learning_rate": 4.5636326819884586e-05, + "loss": 1.0811, + "step": 10749 + }, + { + "epoch": 3.263014114433146, + "grad_norm": 1.0394881963729858, + "learning_rate": 4.563126455401438e-05, + "loss": 1.164, + "step": 10750 + }, + { + "epoch": 3.263317650629838, + "grad_norm": 1.0095058679580688, + "learning_rate": 4.5626202288144175e-05, + "loss": 0.9988, + "step": 10751 + }, + { + "epoch": 3.263621186826529, + "grad_norm": 0.8187227845191956, + "learning_rate": 4.562114002227397e-05, + "loss": 1.1253, + "step": 10752 + }, + { + "epoch": 3.2639247230232207, + "grad_norm": 1.0649471282958984, + "learning_rate": 4.561607775640377e-05, + "loss": 1.1652, + "step": 10753 + }, + { + "epoch": 3.264228259219912, + "grad_norm": 0.9002078175544739, + "learning_rate": 4.5611015490533566e-05, + "loss": 0.8038, + "step": 10754 + }, + { + "epoch": 3.2645317954166035, + "grad_norm": 0.8418742418289185, + "learning_rate": 4.560595322466336e-05, + "loss": 1.59, + "step": 10755 + }, + { + "epoch": 3.2648353316132948, + "grad_norm": 0.8552321195602417, + "learning_rate": 4.5600890958793156e-05, + "loss": 1.3527, + "step": 10756 + }, + { + "epoch": 3.2651388678099864, + "grad_norm": 1.034485936164856, + "learning_rate": 4.559582869292295e-05, + "loss": 1.1875, + "step": 10757 + }, + { + "epoch": 3.2654424040066776, + "grad_norm": 1.0891923904418945, + "learning_rate": 4.559076642705275e-05, + "loss": 0.4991, + "step": 10758 + }, + { + "epoch": 3.2657459402033693, + "grad_norm": 0.9805664420127869, + "learning_rate": 4.558570416118255e-05, + "loss": 1.0874, + "step": 10759 + }, + { + "epoch": 3.2660494764000605, + "grad_norm": 1.0119584798812866, + "learning_rate": 4.558064189531234e-05, + "loss": 1.0662, + "step": 10760 + }, + { + "epoch": 3.266353012596752, + "grad_norm": 1.0217180252075195, + "learning_rate": 4.557557962944214e-05, + "loss": 0.7705, + "step": 10761 + }, + { + "epoch": 3.266656548793444, + "grad_norm": 0.8054290413856506, + "learning_rate": 4.557051736357194e-05, + "loss": 0.8947, + "step": 10762 + }, + { + "epoch": 3.266960084990135, + "grad_norm": 1.1457386016845703, + "learning_rate": 4.556545509770173e-05, + "loss": 0.9263, + "step": 10763 + }, + { + "epoch": 3.2672636211868267, + "grad_norm": 0.8959583044052124, + "learning_rate": 4.556039283183153e-05, + "loss": 0.3086, + "step": 10764 + }, + { + "epoch": 3.267567157383518, + "grad_norm": 0.9370061159133911, + "learning_rate": 4.555533056596132e-05, + "loss": 0.7362, + "step": 10765 + }, + { + "epoch": 3.2678706935802095, + "grad_norm": 1.0184650421142578, + "learning_rate": 4.5550268300091124e-05, + "loss": 1.0134, + "step": 10766 + }, + { + "epoch": 3.2681742297769008, + "grad_norm": 1.1120182275772095, + "learning_rate": 4.554520603422092e-05, + "loss": 0.9048, + "step": 10767 + }, + { + "epoch": 3.2684777659735924, + "grad_norm": 1.038080096244812, + "learning_rate": 4.554014376835072e-05, + "loss": 1.0289, + "step": 10768 + }, + { + "epoch": 3.2687813021702836, + "grad_norm": 1.1295963525772095, + "learning_rate": 4.5535081502480516e-05, + "loss": 0.6511, + "step": 10769 + }, + { + "epoch": 3.2690848383669753, + "grad_norm": 1.0444300174713135, + "learning_rate": 4.553001923661031e-05, + "loss": 1.104, + "step": 10770 + }, + { + "epoch": 3.2693883745636665, + "grad_norm": 0.8410137295722961, + "learning_rate": 4.5524956970740105e-05, + "loss": 0.6854, + "step": 10771 + }, + { + "epoch": 3.269691910760358, + "grad_norm": 1.008929967880249, + "learning_rate": 4.55198947048699e-05, + "loss": 1.1616, + "step": 10772 + }, + { + "epoch": 3.26999544695705, + "grad_norm": 1.3667205572128296, + "learning_rate": 4.55148324389997e-05, + "loss": 0.7699, + "step": 10773 + }, + { + "epoch": 3.270298983153741, + "grad_norm": 0.8297469019889832, + "learning_rate": 4.5509770173129496e-05, + "loss": 0.7757, + "step": 10774 + }, + { + "epoch": 3.2706025193504327, + "grad_norm": 1.2971643209457397, + "learning_rate": 4.550470790725929e-05, + "loss": 0.7429, + "step": 10775 + }, + { + "epoch": 3.270906055547124, + "grad_norm": 1.182885766029358, + "learning_rate": 4.5499645641389086e-05, + "loss": 0.5208, + "step": 10776 + }, + { + "epoch": 3.2712095917438155, + "grad_norm": 1.083641529083252, + "learning_rate": 4.549458337551889e-05, + "loss": 0.8568, + "step": 10777 + }, + { + "epoch": 3.2715131279405067, + "grad_norm": 1.0737552642822266, + "learning_rate": 4.548952110964868e-05, + "loss": 0.8355, + "step": 10778 + }, + { + "epoch": 3.2718166641371984, + "grad_norm": 1.0033292770385742, + "learning_rate": 4.548445884377848e-05, + "loss": 0.8186, + "step": 10779 + }, + { + "epoch": 3.27212020033389, + "grad_norm": 0.9690437912940979, + "learning_rate": 4.547939657790827e-05, + "loss": 0.9415, + "step": 10780 + }, + { + "epoch": 3.2724237365305813, + "grad_norm": 1.1716777086257935, + "learning_rate": 4.5474334312038067e-05, + "loss": 0.9336, + "step": 10781 + }, + { + "epoch": 3.2727272727272725, + "grad_norm": 0.9766501784324646, + "learning_rate": 4.546927204616787e-05, + "loss": 0.9587, + "step": 10782 + }, + { + "epoch": 3.273030808923964, + "grad_norm": 1.0051504373550415, + "learning_rate": 4.546420978029766e-05, + "loss": 0.9871, + "step": 10783 + }, + { + "epoch": 3.273334345120656, + "grad_norm": 1.0697041749954224, + "learning_rate": 4.545914751442746e-05, + "loss": 1.1107, + "step": 10784 + }, + { + "epoch": 3.273637881317347, + "grad_norm": 0.9957068562507629, + "learning_rate": 4.545408524855725e-05, + "loss": 0.909, + "step": 10785 + }, + { + "epoch": 3.2739414175140387, + "grad_norm": 1.003722906112671, + "learning_rate": 4.544902298268705e-05, + "loss": 0.8367, + "step": 10786 + }, + { + "epoch": 3.27424495371073, + "grad_norm": 0.9848899841308594, + "learning_rate": 4.544396071681685e-05, + "loss": 0.6267, + "step": 10787 + }, + { + "epoch": 3.2745484899074215, + "grad_norm": 1.026005506515503, + "learning_rate": 4.543889845094665e-05, + "loss": 0.8625, + "step": 10788 + }, + { + "epoch": 3.2748520261041127, + "grad_norm": 1.042607069015503, + "learning_rate": 4.5433836185076445e-05, + "loss": 0.9467, + "step": 10789 + }, + { + "epoch": 3.2751555623008044, + "grad_norm": 1.153802514076233, + "learning_rate": 4.542877391920624e-05, + "loss": 0.6135, + "step": 10790 + }, + { + "epoch": 3.275459098497496, + "grad_norm": 1.2635871171951294, + "learning_rate": 4.5423711653336035e-05, + "loss": 0.6866, + "step": 10791 + }, + { + "epoch": 3.2757626346941873, + "grad_norm": 0.8729458451271057, + "learning_rate": 4.5418649387465836e-05, + "loss": 0.9745, + "step": 10792 + }, + { + "epoch": 3.276066170890879, + "grad_norm": 1.0708637237548828, + "learning_rate": 4.541358712159563e-05, + "loss": 1.3473, + "step": 10793 + }, + { + "epoch": 3.27636970708757, + "grad_norm": 0.7690752148628235, + "learning_rate": 4.5408524855725426e-05, + "loss": 0.4931, + "step": 10794 + }, + { + "epoch": 3.276673243284262, + "grad_norm": 0.8505693674087524, + "learning_rate": 4.540346258985522e-05, + "loss": 1.5089, + "step": 10795 + }, + { + "epoch": 3.276976779480953, + "grad_norm": 0.7915980219841003, + "learning_rate": 4.5398400323985016e-05, + "loss": 1.0714, + "step": 10796 + }, + { + "epoch": 3.2772803156776447, + "grad_norm": 0.818832278251648, + "learning_rate": 4.539333805811482e-05, + "loss": 1.0885, + "step": 10797 + }, + { + "epoch": 3.277583851874336, + "grad_norm": 1.0225110054016113, + "learning_rate": 4.538827579224461e-05, + "loss": 0.9013, + "step": 10798 + }, + { + "epoch": 3.2778873880710275, + "grad_norm": 0.9142088294029236, + "learning_rate": 4.538321352637441e-05, + "loss": 0.7362, + "step": 10799 + }, + { + "epoch": 3.2781909242677187, + "grad_norm": 0.9426360726356506, + "learning_rate": 4.53781512605042e-05, + "loss": 0.5428, + "step": 10800 + }, + { + "epoch": 3.2784944604644104, + "grad_norm": 0.9673095941543579, + "learning_rate": 4.5373088994634e-05, + "loss": 0.3388, + "step": 10801 + }, + { + "epoch": 3.278797996661102, + "grad_norm": 0.7650625705718994, + "learning_rate": 4.53680267287638e-05, + "loss": 1.1231, + "step": 10802 + }, + { + "epoch": 3.2791015328577933, + "grad_norm": 0.7977088093757629, + "learning_rate": 4.536296446289359e-05, + "loss": 0.7923, + "step": 10803 + }, + { + "epoch": 3.279405069054485, + "grad_norm": 0.8154423832893372, + "learning_rate": 4.535790219702339e-05, + "loss": 0.5937, + "step": 10804 + }, + { + "epoch": 3.279708605251176, + "grad_norm": 0.9257239699363708, + "learning_rate": 4.535283993115318e-05, + "loss": 0.6683, + "step": 10805 + }, + { + "epoch": 3.280012141447868, + "grad_norm": 0.9158928394317627, + "learning_rate": 4.5347777665282984e-05, + "loss": 0.9346, + "step": 10806 + }, + { + "epoch": 3.280315677644559, + "grad_norm": 0.9674535393714905, + "learning_rate": 4.534271539941278e-05, + "loss": 1.1996, + "step": 10807 + }, + { + "epoch": 3.2806192138412507, + "grad_norm": 0.8085349202156067, + "learning_rate": 4.5337653133542573e-05, + "loss": 1.0492, + "step": 10808 + }, + { + "epoch": 3.280922750037942, + "grad_norm": 1.0119742155075073, + "learning_rate": 4.533259086767237e-05, + "loss": 0.6611, + "step": 10809 + }, + { + "epoch": 3.2812262862346335, + "grad_norm": 1.1893967390060425, + "learning_rate": 4.532752860180216e-05, + "loss": 0.7196, + "step": 10810 + }, + { + "epoch": 3.2815298224313247, + "grad_norm": 1.0236854553222656, + "learning_rate": 4.5322466335931965e-05, + "loss": 0.7379, + "step": 10811 + }, + { + "epoch": 3.2818333586280164, + "grad_norm": 0.7365982532501221, + "learning_rate": 4.5317404070061766e-05, + "loss": 0.9141, + "step": 10812 + }, + { + "epoch": 3.282136894824708, + "grad_norm": 1.2536211013793945, + "learning_rate": 4.531234180419156e-05, + "loss": 1.0218, + "step": 10813 + }, + { + "epoch": 3.2824404310213993, + "grad_norm": 1.0975487232208252, + "learning_rate": 4.5307279538321356e-05, + "loss": 0.8333, + "step": 10814 + }, + { + "epoch": 3.282743967218091, + "grad_norm": 1.0106996297836304, + "learning_rate": 4.530221727245115e-05, + "loss": 1.0573, + "step": 10815 + }, + { + "epoch": 3.283047503414782, + "grad_norm": 0.7988746762275696, + "learning_rate": 4.529715500658095e-05, + "loss": 1.0692, + "step": 10816 + }, + { + "epoch": 3.283351039611474, + "grad_norm": 0.989445149898529, + "learning_rate": 4.529209274071075e-05, + "loss": 0.8215, + "step": 10817 + }, + { + "epoch": 3.283654575808165, + "grad_norm": 0.8841937184333801, + "learning_rate": 4.528703047484054e-05, + "loss": 0.3167, + "step": 10818 + }, + { + "epoch": 3.2839581120048567, + "grad_norm": 0.976102888584137, + "learning_rate": 4.5281968208970337e-05, + "loss": 0.7513, + "step": 10819 + }, + { + "epoch": 3.284261648201548, + "grad_norm": 1.139563798904419, + "learning_rate": 4.527690594310013e-05, + "loss": 0.9575, + "step": 10820 + }, + { + "epoch": 3.2845651843982395, + "grad_norm": 1.1920915842056274, + "learning_rate": 4.527184367722993e-05, + "loss": 0.4345, + "step": 10821 + }, + { + "epoch": 3.2848687205949307, + "grad_norm": 0.7356903553009033, + "learning_rate": 4.526678141135973e-05, + "loss": 0.7213, + "step": 10822 + }, + { + "epoch": 3.2851722567916224, + "grad_norm": 1.1041041612625122, + "learning_rate": 4.526171914548952e-05, + "loss": 1.1833, + "step": 10823 + }, + { + "epoch": 3.285475792988314, + "grad_norm": 0.8261696100234985, + "learning_rate": 4.525665687961932e-05, + "loss": 1.0317, + "step": 10824 + }, + { + "epoch": 3.2857793291850053, + "grad_norm": 1.0006409883499146, + "learning_rate": 4.525159461374911e-05, + "loss": 0.6768, + "step": 10825 + }, + { + "epoch": 3.286082865381697, + "grad_norm": 0.9552530646324158, + "learning_rate": 4.5246532347878914e-05, + "loss": 1.4078, + "step": 10826 + }, + { + "epoch": 3.286386401578388, + "grad_norm": 1.0804533958435059, + "learning_rate": 4.524147008200871e-05, + "loss": 0.5627, + "step": 10827 + }, + { + "epoch": 3.28668993777508, + "grad_norm": 1.1761291027069092, + "learning_rate": 4.52364078161385e-05, + "loss": 0.5646, + "step": 10828 + }, + { + "epoch": 3.286993473971771, + "grad_norm": 1.11805260181427, + "learning_rate": 4.52313455502683e-05, + "loss": 1.0724, + "step": 10829 + }, + { + "epoch": 3.2872970101684627, + "grad_norm": 1.1052323579788208, + "learning_rate": 4.52262832843981e-05, + "loss": 0.8955, + "step": 10830 + }, + { + "epoch": 3.287600546365154, + "grad_norm": 0.9430875182151794, + "learning_rate": 4.5221221018527894e-05, + "loss": 0.771, + "step": 10831 + }, + { + "epoch": 3.2879040825618455, + "grad_norm": 0.9106606841087341, + "learning_rate": 4.521615875265769e-05, + "loss": 0.8677, + "step": 10832 + }, + { + "epoch": 3.2882076187585367, + "grad_norm": 0.9721511006355286, + "learning_rate": 4.521109648678749e-05, + "loss": 1.1159, + "step": 10833 + }, + { + "epoch": 3.2885111549552284, + "grad_norm": 0.9404895305633545, + "learning_rate": 4.5206034220917286e-05, + "loss": 1.1512, + "step": 10834 + }, + { + "epoch": 3.28881469115192, + "grad_norm": 1.0442395210266113, + "learning_rate": 4.520097195504708e-05, + "loss": 1.4154, + "step": 10835 + }, + { + "epoch": 3.2891182273486113, + "grad_norm": 0.9572499394416809, + "learning_rate": 4.519590968917688e-05, + "loss": 1.4188, + "step": 10836 + }, + { + "epoch": 3.289421763545303, + "grad_norm": 0.820774257183075, + "learning_rate": 4.519084742330668e-05, + "loss": 1.4798, + "step": 10837 + }, + { + "epoch": 3.289725299741994, + "grad_norm": 1.1621805429458618, + "learning_rate": 4.518578515743647e-05, + "loss": 0.8693, + "step": 10838 + }, + { + "epoch": 3.290028835938686, + "grad_norm": 1.2397860288619995, + "learning_rate": 4.5180722891566266e-05, + "loss": 0.9029, + "step": 10839 + }, + { + "epoch": 3.290332372135377, + "grad_norm": 1.1109083890914917, + "learning_rate": 4.517566062569607e-05, + "loss": 0.8677, + "step": 10840 + }, + { + "epoch": 3.2906359083320686, + "grad_norm": 0.9827634692192078, + "learning_rate": 4.517059835982586e-05, + "loss": 0.9328, + "step": 10841 + }, + { + "epoch": 3.29093944452876, + "grad_norm": 0.8185612559318542, + "learning_rate": 4.516553609395566e-05, + "loss": 0.9312, + "step": 10842 + }, + { + "epoch": 3.2912429807254515, + "grad_norm": 1.061167597770691, + "learning_rate": 4.516047382808545e-05, + "loss": 1.0439, + "step": 10843 + }, + { + "epoch": 3.2915465169221427, + "grad_norm": 0.9955297112464905, + "learning_rate": 4.515541156221525e-05, + "loss": 0.9614, + "step": 10844 + }, + { + "epoch": 3.2918500531188344, + "grad_norm": 0.895663321018219, + "learning_rate": 4.515034929634505e-05, + "loss": 1.2081, + "step": 10845 + }, + { + "epoch": 3.292153589315526, + "grad_norm": 0.819865345954895, + "learning_rate": 4.5145287030474843e-05, + "loss": 0.6867, + "step": 10846 + }, + { + "epoch": 3.2924571255122173, + "grad_norm": 0.7340788245201111, + "learning_rate": 4.514022476460464e-05, + "loss": 1.3062, + "step": 10847 + }, + { + "epoch": 3.292760661708909, + "grad_norm": 0.9020425081253052, + "learning_rate": 4.513516249873443e-05, + "loss": 1.1613, + "step": 10848 + }, + { + "epoch": 3.2930641979056, + "grad_norm": 0.9901716709136963, + "learning_rate": 4.513010023286423e-05, + "loss": 0.7686, + "step": 10849 + }, + { + "epoch": 3.2933677341022918, + "grad_norm": 1.0959265232086182, + "learning_rate": 4.512503796699403e-05, + "loss": 1.15, + "step": 10850 + }, + { + "epoch": 3.293671270298983, + "grad_norm": 0.9881561994552612, + "learning_rate": 4.5119975701123824e-05, + "loss": 0.4527, + "step": 10851 + }, + { + "epoch": 3.2939748064956746, + "grad_norm": 1.2317548990249634, + "learning_rate": 4.511491343525362e-05, + "loss": 0.8456, + "step": 10852 + }, + { + "epoch": 3.2942783426923663, + "grad_norm": 0.9707849621772766, + "learning_rate": 4.5109851169383414e-05, + "loss": 0.6435, + "step": 10853 + }, + { + "epoch": 3.2945818788890575, + "grad_norm": 0.9256259202957153, + "learning_rate": 4.5104788903513215e-05, + "loss": 1.1475, + "step": 10854 + }, + { + "epoch": 3.294885415085749, + "grad_norm": 1.0185927152633667, + "learning_rate": 4.509972663764302e-05, + "loss": 0.5531, + "step": 10855 + }, + { + "epoch": 3.2951889512824404, + "grad_norm": 1.201206922531128, + "learning_rate": 4.509466437177281e-05, + "loss": 0.8396, + "step": 10856 + }, + { + "epoch": 3.295492487479132, + "grad_norm": 0.7882808446884155, + "learning_rate": 4.5089602105902607e-05, + "loss": 0.6557, + "step": 10857 + }, + { + "epoch": 3.2957960236758232, + "grad_norm": 1.0940752029418945, + "learning_rate": 4.50845398400324e-05, + "loss": 0.9749, + "step": 10858 + }, + { + "epoch": 3.296099559872515, + "grad_norm": 1.0860530138015747, + "learning_rate": 4.5079477574162196e-05, + "loss": 1.2654, + "step": 10859 + }, + { + "epoch": 3.296403096069206, + "grad_norm": 0.8823529481887817, + "learning_rate": 4.5074415308292e-05, + "loss": 1.0959, + "step": 10860 + }, + { + "epoch": 3.2967066322658978, + "grad_norm": 1.0525271892547607, + "learning_rate": 4.506935304242179e-05, + "loss": 0.7816, + "step": 10861 + }, + { + "epoch": 3.297010168462589, + "grad_norm": 0.8928176760673523, + "learning_rate": 4.506429077655159e-05, + "loss": 1.207, + "step": 10862 + }, + { + "epoch": 3.2973137046592806, + "grad_norm": 0.74480140209198, + "learning_rate": 4.505922851068138e-05, + "loss": 0.7624, + "step": 10863 + }, + { + "epoch": 3.2976172408559723, + "grad_norm": 1.138056755065918, + "learning_rate": 4.505416624481118e-05, + "loss": 1.2181, + "step": 10864 + }, + { + "epoch": 3.2979207770526635, + "grad_norm": 0.9833407402038574, + "learning_rate": 4.504910397894098e-05, + "loss": 0.4512, + "step": 10865 + }, + { + "epoch": 3.298224313249355, + "grad_norm": 0.8189109563827515, + "learning_rate": 4.504404171307077e-05, + "loss": 0.6136, + "step": 10866 + }, + { + "epoch": 3.2985278494460464, + "grad_norm": 0.9859591722488403, + "learning_rate": 4.503897944720057e-05, + "loss": 1.3062, + "step": 10867 + }, + { + "epoch": 3.298831385642738, + "grad_norm": 1.1034716367721558, + "learning_rate": 4.503391718133036e-05, + "loss": 0.5403, + "step": 10868 + }, + { + "epoch": 3.2991349218394292, + "grad_norm": 1.2957113981246948, + "learning_rate": 4.5028854915460164e-05, + "loss": 0.6054, + "step": 10869 + }, + { + "epoch": 3.299438458036121, + "grad_norm": 1.1714705228805542, + "learning_rate": 4.502379264958996e-05, + "loss": 1.1525, + "step": 10870 + }, + { + "epoch": 3.299741994232812, + "grad_norm": 0.9049932956695557, + "learning_rate": 4.5018730383719754e-05, + "loss": 0.5488, + "step": 10871 + }, + { + "epoch": 3.3000455304295038, + "grad_norm": 0.9690504670143127, + "learning_rate": 4.501366811784955e-05, + "loss": 0.9167, + "step": 10872 + }, + { + "epoch": 3.300349066626195, + "grad_norm": 1.1646034717559814, + "learning_rate": 4.5008605851979344e-05, + "loss": 0.7624, + "step": 10873 + }, + { + "epoch": 3.3006526028228866, + "grad_norm": 0.9217798113822937, + "learning_rate": 4.5003543586109145e-05, + "loss": 0.2453, + "step": 10874 + }, + { + "epoch": 3.3009561390195783, + "grad_norm": 1.0002036094665527, + "learning_rate": 4.499848132023894e-05, + "loss": 1.3231, + "step": 10875 + }, + { + "epoch": 3.3012596752162695, + "grad_norm": 0.9526854753494263, + "learning_rate": 4.4993419054368735e-05, + "loss": 1.0963, + "step": 10876 + }, + { + "epoch": 3.301563211412961, + "grad_norm": 0.6665928363800049, + "learning_rate": 4.4988356788498536e-05, + "loss": 1.0477, + "step": 10877 + }, + { + "epoch": 3.3018667476096524, + "grad_norm": 1.1407513618469238, + "learning_rate": 4.498329452262833e-05, + "loss": 0.8914, + "step": 10878 + }, + { + "epoch": 3.302170283806344, + "grad_norm": 1.0255204439163208, + "learning_rate": 4.497823225675813e-05, + "loss": 1.1079, + "step": 10879 + }, + { + "epoch": 3.3024738200030352, + "grad_norm": 1.0184648036956787, + "learning_rate": 4.497316999088793e-05, + "loss": 1.0209, + "step": 10880 + }, + { + "epoch": 3.302777356199727, + "grad_norm": 0.9730910062789917, + "learning_rate": 4.496810772501772e-05, + "loss": 1.4784, + "step": 10881 + }, + { + "epoch": 3.303080892396418, + "grad_norm": 1.1805129051208496, + "learning_rate": 4.496304545914752e-05, + "loss": 0.699, + "step": 10882 + }, + { + "epoch": 3.3033844285931098, + "grad_norm": 1.1000862121582031, + "learning_rate": 4.495798319327731e-05, + "loss": 0.7366, + "step": 10883 + }, + { + "epoch": 3.303687964789801, + "grad_norm": 1.0862362384796143, + "learning_rate": 4.4952920927407113e-05, + "loss": 1.2201, + "step": 10884 + }, + { + "epoch": 3.3039915009864926, + "grad_norm": 1.286479115486145, + "learning_rate": 4.494785866153691e-05, + "loss": 0.8507, + "step": 10885 + }, + { + "epoch": 3.3042950371831843, + "grad_norm": 1.0260646343231201, + "learning_rate": 4.49427963956667e-05, + "loss": 1.2755, + "step": 10886 + }, + { + "epoch": 3.3045985733798755, + "grad_norm": 0.5905255079269409, + "learning_rate": 4.49377341297965e-05, + "loss": 0.7573, + "step": 10887 + }, + { + "epoch": 3.304902109576567, + "grad_norm": 0.7603309154510498, + "learning_rate": 4.493267186392629e-05, + "loss": 0.5304, + "step": 10888 + }, + { + "epoch": 3.3052056457732584, + "grad_norm": 1.2025705575942993, + "learning_rate": 4.4927609598056094e-05, + "loss": 0.9404, + "step": 10889 + }, + { + "epoch": 3.30550918196995, + "grad_norm": 1.1327776908874512, + "learning_rate": 4.492254733218589e-05, + "loss": 1.0736, + "step": 10890 + }, + { + "epoch": 3.3058127181666412, + "grad_norm": 1.0493543148040771, + "learning_rate": 4.4917485066315684e-05, + "loss": 1.0705, + "step": 10891 + }, + { + "epoch": 3.306116254363333, + "grad_norm": 1.0355929136276245, + "learning_rate": 4.491242280044548e-05, + "loss": 1.1802, + "step": 10892 + }, + { + "epoch": 3.306419790560024, + "grad_norm": 1.2506612539291382, + "learning_rate": 4.490736053457528e-05, + "loss": 1.1651, + "step": 10893 + }, + { + "epoch": 3.3067233267567158, + "grad_norm": 1.2677823305130005, + "learning_rate": 4.4902298268705075e-05, + "loss": 0.7461, + "step": 10894 + }, + { + "epoch": 3.307026862953407, + "grad_norm": 0.7385201454162598, + "learning_rate": 4.489723600283487e-05, + "loss": 1.0815, + "step": 10895 + }, + { + "epoch": 3.3073303991500986, + "grad_norm": 1.2011213302612305, + "learning_rate": 4.4892173736964664e-05, + "loss": 0.5656, + "step": 10896 + }, + { + "epoch": 3.3076339353467903, + "grad_norm": 1.054531455039978, + "learning_rate": 4.488711147109446e-05, + "loss": 0.9415, + "step": 10897 + }, + { + "epoch": 3.3079374715434815, + "grad_norm": 0.7761757373809814, + "learning_rate": 4.488204920522426e-05, + "loss": 1.2643, + "step": 10898 + }, + { + "epoch": 3.308241007740173, + "grad_norm": 0.9899110198020935, + "learning_rate": 4.4876986939354056e-05, + "loss": 0.7178, + "step": 10899 + }, + { + "epoch": 3.3085445439368644, + "grad_norm": 1.0554609298706055, + "learning_rate": 4.487192467348386e-05, + "loss": 0.9268, + "step": 10900 + }, + { + "epoch": 3.308848080133556, + "grad_norm": 0.9690304398536682, + "learning_rate": 4.486686240761365e-05, + "loss": 0.4653, + "step": 10901 + }, + { + "epoch": 3.3091516163302472, + "grad_norm": 0.9855083227157593, + "learning_rate": 4.486180014174345e-05, + "loss": 0.7983, + "step": 10902 + }, + { + "epoch": 3.309455152526939, + "grad_norm": 1.1696903705596924, + "learning_rate": 4.485673787587324e-05, + "loss": 1.0691, + "step": 10903 + }, + { + "epoch": 3.30975868872363, + "grad_norm": 1.021609902381897, + "learning_rate": 4.485167561000304e-05, + "loss": 0.739, + "step": 10904 + }, + { + "epoch": 3.3100622249203218, + "grad_norm": 0.905699610710144, + "learning_rate": 4.484661334413284e-05, + "loss": 1.1371, + "step": 10905 + }, + { + "epoch": 3.310365761117013, + "grad_norm": 1.198501706123352, + "learning_rate": 4.484155107826263e-05, + "loss": 1.1738, + "step": 10906 + }, + { + "epoch": 3.3106692973137046, + "grad_norm": 1.210067629814148, + "learning_rate": 4.483648881239243e-05, + "loss": 0.7405, + "step": 10907 + }, + { + "epoch": 3.3109728335103963, + "grad_norm": 0.9705145955085754, + "learning_rate": 4.483142654652223e-05, + "loss": 1.1276, + "step": 10908 + }, + { + "epoch": 3.3112763697070875, + "grad_norm": 1.0239938497543335, + "learning_rate": 4.4826364280652024e-05, + "loss": 1.2271, + "step": 10909 + }, + { + "epoch": 3.311579905903779, + "grad_norm": 1.0348726511001587, + "learning_rate": 4.482130201478182e-05, + "loss": 0.7893, + "step": 10910 + }, + { + "epoch": 3.3118834421004704, + "grad_norm": 1.1248940229415894, + "learning_rate": 4.4816239748911614e-05, + "loss": 0.956, + "step": 10911 + }, + { + "epoch": 3.312186978297162, + "grad_norm": 1.0381546020507812, + "learning_rate": 4.481117748304141e-05, + "loss": 0.9179, + "step": 10912 + }, + { + "epoch": 3.3124905144938532, + "grad_norm": 1.0096803903579712, + "learning_rate": 4.480611521717121e-05, + "loss": 0.6458, + "step": 10913 + }, + { + "epoch": 3.312794050690545, + "grad_norm": 0.8411881923675537, + "learning_rate": 4.4801052951301005e-05, + "loss": 1.1148, + "step": 10914 + }, + { + "epoch": 3.3130975868872365, + "grad_norm": 0.8006409406661987, + "learning_rate": 4.47959906854308e-05, + "loss": 0.3209, + "step": 10915 + }, + { + "epoch": 3.3134011230839278, + "grad_norm": 1.1087896823883057, + "learning_rate": 4.4790928419560594e-05, + "loss": 0.6086, + "step": 10916 + }, + { + "epoch": 3.313704659280619, + "grad_norm": 1.3730086088180542, + "learning_rate": 4.478586615369039e-05, + "loss": 0.6804, + "step": 10917 + }, + { + "epoch": 3.3140081954773106, + "grad_norm": 1.0860646963119507, + "learning_rate": 4.478080388782019e-05, + "loss": 0.8157, + "step": 10918 + }, + { + "epoch": 3.3143117316740023, + "grad_norm": 0.9106684327125549, + "learning_rate": 4.4775741621949985e-05, + "loss": 1.321, + "step": 10919 + }, + { + "epoch": 3.3146152678706935, + "grad_norm": 0.8768418431282043, + "learning_rate": 4.477067935607978e-05, + "loss": 1.1126, + "step": 10920 + }, + { + "epoch": 3.314918804067385, + "grad_norm": 0.9425840377807617, + "learning_rate": 4.4765617090209575e-05, + "loss": 1.2696, + "step": 10921 + }, + { + "epoch": 3.3152223402640764, + "grad_norm": 0.9276318550109863, + "learning_rate": 4.4760554824339377e-05, + "loss": 0.4936, + "step": 10922 + }, + { + "epoch": 3.315525876460768, + "grad_norm": 0.7225998044013977, + "learning_rate": 4.475549255846918e-05, + "loss": 0.7051, + "step": 10923 + }, + { + "epoch": 3.3158294126574592, + "grad_norm": 0.8670743107795715, + "learning_rate": 4.475043029259897e-05, + "loss": 0.4315, + "step": 10924 + }, + { + "epoch": 3.316132948854151, + "grad_norm": 0.9413848519325256, + "learning_rate": 4.474536802672877e-05, + "loss": 0.9523, + "step": 10925 + }, + { + "epoch": 3.3164364850508425, + "grad_norm": 1.1482104063034058, + "learning_rate": 4.474030576085856e-05, + "loss": 0.9337, + "step": 10926 + }, + { + "epoch": 3.3167400212475338, + "grad_norm": 1.0772678852081299, + "learning_rate": 4.473524349498836e-05, + "loss": 0.6775, + "step": 10927 + }, + { + "epoch": 3.3170435574442254, + "grad_norm": 1.1513588428497314, + "learning_rate": 4.473018122911816e-05, + "loss": 0.977, + "step": 10928 + }, + { + "epoch": 3.3173470936409166, + "grad_norm": 1.1173572540283203, + "learning_rate": 4.4725118963247954e-05, + "loss": 1.1556, + "step": 10929 + }, + { + "epoch": 3.3176506298376083, + "grad_norm": 1.0247441530227661, + "learning_rate": 4.472005669737775e-05, + "loss": 1.0415, + "step": 10930 + }, + { + "epoch": 3.3179541660342995, + "grad_norm": 1.260218620300293, + "learning_rate": 4.471499443150754e-05, + "loss": 0.5901, + "step": 10931 + }, + { + "epoch": 3.318257702230991, + "grad_norm": 0.8548129200935364, + "learning_rate": 4.4709932165637345e-05, + "loss": 0.9202, + "step": 10932 + }, + { + "epoch": 3.3185612384276824, + "grad_norm": 1.195607304573059, + "learning_rate": 4.470486989976714e-05, + "loss": 0.9249, + "step": 10933 + }, + { + "epoch": 3.318864774624374, + "grad_norm": 1.0254417657852173, + "learning_rate": 4.4699807633896934e-05, + "loss": 0.8726, + "step": 10934 + }, + { + "epoch": 3.3191683108210652, + "grad_norm": 0.9081615805625916, + "learning_rate": 4.469474536802673e-05, + "loss": 0.8956, + "step": 10935 + }, + { + "epoch": 3.319471847017757, + "grad_norm": 0.9594190716743469, + "learning_rate": 4.4689683102156524e-05, + "loss": 0.9726, + "step": 10936 + }, + { + "epoch": 3.3197753832144485, + "grad_norm": 1.1127408742904663, + "learning_rate": 4.4684620836286326e-05, + "loss": 0.9327, + "step": 10937 + }, + { + "epoch": 3.3200789194111398, + "grad_norm": 1.2862492799758911, + "learning_rate": 4.467955857041612e-05, + "loss": 0.7903, + "step": 10938 + }, + { + "epoch": 3.3203824556078314, + "grad_norm": 0.9727734327316284, + "learning_rate": 4.4674496304545915e-05, + "loss": 1.0164, + "step": 10939 + }, + { + "epoch": 3.3206859918045226, + "grad_norm": 1.2466306686401367, + "learning_rate": 4.466943403867571e-05, + "loss": 0.8256, + "step": 10940 + }, + { + "epoch": 3.3209895280012143, + "grad_norm": 1.0429720878601074, + "learning_rate": 4.4664371772805505e-05, + "loss": 1.0149, + "step": 10941 + }, + { + "epoch": 3.3212930641979055, + "grad_norm": 1.301657795906067, + "learning_rate": 4.4659309506935306e-05, + "loss": 0.7164, + "step": 10942 + }, + { + "epoch": 3.321596600394597, + "grad_norm": 1.049839973449707, + "learning_rate": 4.46542472410651e-05, + "loss": 1.0403, + "step": 10943 + }, + { + "epoch": 3.3219001365912884, + "grad_norm": 0.788395881652832, + "learning_rate": 4.46491849751949e-05, + "loss": 1.3621, + "step": 10944 + }, + { + "epoch": 3.32220367278798, + "grad_norm": 1.1904346942901611, + "learning_rate": 4.46441227093247e-05, + "loss": 1.0498, + "step": 10945 + }, + { + "epoch": 3.3225072089846712, + "grad_norm": 1.0005336999893188, + "learning_rate": 4.463906044345449e-05, + "loss": 1.0798, + "step": 10946 + }, + { + "epoch": 3.322810745181363, + "grad_norm": 0.8933365941047668, + "learning_rate": 4.4633998177584294e-05, + "loss": 1.3578, + "step": 10947 + }, + { + "epoch": 3.3231142813780545, + "grad_norm": 0.9748919010162354, + "learning_rate": 4.462893591171409e-05, + "loss": 1.2166, + "step": 10948 + }, + { + "epoch": 3.3234178175747457, + "grad_norm": 0.8765515685081482, + "learning_rate": 4.4623873645843883e-05, + "loss": 0.875, + "step": 10949 + }, + { + "epoch": 3.3237213537714374, + "grad_norm": 1.0601716041564941, + "learning_rate": 4.461881137997368e-05, + "loss": 0.793, + "step": 10950 + }, + { + "epoch": 3.3240248899681286, + "grad_norm": 0.848078727722168, + "learning_rate": 4.461374911410347e-05, + "loss": 0.7981, + "step": 10951 + }, + { + "epoch": 3.3243284261648203, + "grad_norm": 1.012888789176941, + "learning_rate": 4.4608686848233275e-05, + "loss": 1.0887, + "step": 10952 + }, + { + "epoch": 3.3246319623615115, + "grad_norm": 1.178536295890808, + "learning_rate": 4.460362458236307e-05, + "loss": 0.9093, + "step": 10953 + }, + { + "epoch": 3.324935498558203, + "grad_norm": 1.0819247961044312, + "learning_rate": 4.4598562316492864e-05, + "loss": 1.2429, + "step": 10954 + }, + { + "epoch": 3.3252390347548944, + "grad_norm": 1.0653207302093506, + "learning_rate": 4.459350005062266e-05, + "loss": 1.0703, + "step": 10955 + }, + { + "epoch": 3.325542570951586, + "grad_norm": 1.104990839958191, + "learning_rate": 4.4588437784752454e-05, + "loss": 1.2909, + "step": 10956 + }, + { + "epoch": 3.325846107148277, + "grad_norm": 1.111246943473816, + "learning_rate": 4.4583375518882255e-05, + "loss": 1.0612, + "step": 10957 + }, + { + "epoch": 3.326149643344969, + "grad_norm": 1.0744335651397705, + "learning_rate": 4.457831325301205e-05, + "loss": 0.8582, + "step": 10958 + }, + { + "epoch": 3.3264531795416605, + "grad_norm": 1.0981570482254028, + "learning_rate": 4.4573250987141845e-05, + "loss": 0.8824, + "step": 10959 + }, + { + "epoch": 3.3267567157383517, + "grad_norm": 1.046245813369751, + "learning_rate": 4.456818872127164e-05, + "loss": 0.9581, + "step": 10960 + }, + { + "epoch": 3.3270602519350434, + "grad_norm": 0.9277337789535522, + "learning_rate": 4.456312645540144e-05, + "loss": 1.3084, + "step": 10961 + }, + { + "epoch": 3.3273637881317346, + "grad_norm": 0.7490749359130859, + "learning_rate": 4.4558064189531236e-05, + "loss": 1.5654, + "step": 10962 + }, + { + "epoch": 3.3276673243284263, + "grad_norm": 0.927712082862854, + "learning_rate": 4.455300192366103e-05, + "loss": 1.2636, + "step": 10963 + }, + { + "epoch": 3.3279708605251175, + "grad_norm": 1.0332348346710205, + "learning_rate": 4.4547939657790826e-05, + "loss": 0.6393, + "step": 10964 + }, + { + "epoch": 3.328274396721809, + "grad_norm": 0.8732558488845825, + "learning_rate": 4.454287739192062e-05, + "loss": 1.159, + "step": 10965 + }, + { + "epoch": 3.3285779329185003, + "grad_norm": 1.1629630327224731, + "learning_rate": 4.453781512605042e-05, + "loss": 0.9721, + "step": 10966 + }, + { + "epoch": 3.328881469115192, + "grad_norm": 1.132015347480774, + "learning_rate": 4.4532752860180224e-05, + "loss": 0.8711, + "step": 10967 + }, + { + "epoch": 3.329185005311883, + "grad_norm": 0.7799678444862366, + "learning_rate": 4.452769059431002e-05, + "loss": 0.9832, + "step": 10968 + }, + { + "epoch": 3.329488541508575, + "grad_norm": 0.9966577887535095, + "learning_rate": 4.452262832843981e-05, + "loss": 1.2998, + "step": 10969 + }, + { + "epoch": 3.3297920777052665, + "grad_norm": 1.064753532409668, + "learning_rate": 4.451756606256961e-05, + "loss": 0.5435, + "step": 10970 + }, + { + "epoch": 3.3300956139019577, + "grad_norm": 0.8293728828430176, + "learning_rate": 4.451250379669941e-05, + "loss": 0.8826, + "step": 10971 + }, + { + "epoch": 3.3303991500986494, + "grad_norm": 0.8391801118850708, + "learning_rate": 4.4507441530829204e-05, + "loss": 0.835, + "step": 10972 + }, + { + "epoch": 3.3307026862953406, + "grad_norm": 1.1622800827026367, + "learning_rate": 4.4502379264959e-05, + "loss": 0.4311, + "step": 10973 + }, + { + "epoch": 3.3310062224920323, + "grad_norm": 1.087738275527954, + "learning_rate": 4.4497316999088794e-05, + "loss": 0.6573, + "step": 10974 + }, + { + "epoch": 3.3313097586887235, + "grad_norm": 1.005753993988037, + "learning_rate": 4.449225473321859e-05, + "loss": 0.8216, + "step": 10975 + }, + { + "epoch": 3.331613294885415, + "grad_norm": 1.2015050649642944, + "learning_rate": 4.448719246734839e-05, + "loss": 1.149, + "step": 10976 + }, + { + "epoch": 3.331916831082107, + "grad_norm": 1.1151928901672363, + "learning_rate": 4.4482130201478185e-05, + "loss": 0.7351, + "step": 10977 + }, + { + "epoch": 3.332220367278798, + "grad_norm": 1.2498070001602173, + "learning_rate": 4.447706793560798e-05, + "loss": 0.9142, + "step": 10978 + }, + { + "epoch": 3.332523903475489, + "grad_norm": 1.1304054260253906, + "learning_rate": 4.4472005669737775e-05, + "loss": 0.9886, + "step": 10979 + }, + { + "epoch": 3.332827439672181, + "grad_norm": 0.5813860297203064, + "learning_rate": 4.446694340386757e-05, + "loss": 1.0909, + "step": 10980 + }, + { + "epoch": 3.3331309758688725, + "grad_norm": 0.8783494234085083, + "learning_rate": 4.446188113799737e-05, + "loss": 1.0668, + "step": 10981 + }, + { + "epoch": 3.3334345120655637, + "grad_norm": 0.94245845079422, + "learning_rate": 4.4456818872127166e-05, + "loss": 1.0677, + "step": 10982 + }, + { + "epoch": 3.3337380482622554, + "grad_norm": 0.9128747582435608, + "learning_rate": 4.445175660625696e-05, + "loss": 0.5037, + "step": 10983 + }, + { + "epoch": 3.3340415844589466, + "grad_norm": 0.9580438733100891, + "learning_rate": 4.4446694340386755e-05, + "loss": 1.0751, + "step": 10984 + }, + { + "epoch": 3.3343451206556383, + "grad_norm": 0.7258634567260742, + "learning_rate": 4.444163207451656e-05, + "loss": 0.7628, + "step": 10985 + }, + { + "epoch": 3.3346486568523295, + "grad_norm": 1.1738688945770264, + "learning_rate": 4.443656980864635e-05, + "loss": 0.905, + "step": 10986 + }, + { + "epoch": 3.334952193049021, + "grad_norm": 1.0234447717666626, + "learning_rate": 4.443150754277615e-05, + "loss": 0.9109, + "step": 10987 + }, + { + "epoch": 3.335255729245713, + "grad_norm": 0.6312156319618225, + "learning_rate": 4.442644527690594e-05, + "loss": 1.1263, + "step": 10988 + }, + { + "epoch": 3.335559265442404, + "grad_norm": 1.0083098411560059, + "learning_rate": 4.442138301103574e-05, + "loss": 1.2425, + "step": 10989 + }, + { + "epoch": 3.3358628016390957, + "grad_norm": 0.8865886926651001, + "learning_rate": 4.441632074516554e-05, + "loss": 0.4524, + "step": 10990 + }, + { + "epoch": 3.336166337835787, + "grad_norm": 1.2328208684921265, + "learning_rate": 4.441125847929534e-05, + "loss": 0.6965, + "step": 10991 + }, + { + "epoch": 3.3364698740324785, + "grad_norm": 1.1210994720458984, + "learning_rate": 4.4406196213425134e-05, + "loss": 0.8978, + "step": 10992 + }, + { + "epoch": 3.3367734102291697, + "grad_norm": 1.1154487133026123, + "learning_rate": 4.440113394755493e-05, + "loss": 0.5072, + "step": 10993 + }, + { + "epoch": 3.3370769464258614, + "grad_norm": 0.9243950247764587, + "learning_rate": 4.4396071681684724e-05, + "loss": 1.122, + "step": 10994 + }, + { + "epoch": 3.3373804826225526, + "grad_norm": 0.94225013256073, + "learning_rate": 4.439100941581452e-05, + "loss": 0.8198, + "step": 10995 + }, + { + "epoch": 3.3376840188192443, + "grad_norm": 1.1107290983200073, + "learning_rate": 4.438594714994432e-05, + "loss": 0.7648, + "step": 10996 + }, + { + "epoch": 3.3379875550159355, + "grad_norm": 1.0457758903503418, + "learning_rate": 4.4380884884074115e-05, + "loss": 0.6871, + "step": 10997 + }, + { + "epoch": 3.338291091212627, + "grad_norm": 0.9543755650520325, + "learning_rate": 4.437582261820391e-05, + "loss": 0.3973, + "step": 10998 + }, + { + "epoch": 3.338594627409319, + "grad_norm": 1.2076659202575684, + "learning_rate": 4.4370760352333705e-05, + "loss": 0.6717, + "step": 10999 + }, + { + "epoch": 3.33889816360601, + "grad_norm": 0.8962266445159912, + "learning_rate": 4.4365698086463506e-05, + "loss": 1.1488, + "step": 11000 + }, + { + "epoch": 3.3392016998027017, + "grad_norm": 1.1356310844421387, + "learning_rate": 4.43606358205933e-05, + "loss": 1.0305, + "step": 11001 + }, + { + "epoch": 3.339505235999393, + "grad_norm": 1.086762547492981, + "learning_rate": 4.4355573554723096e-05, + "loss": 0.9823, + "step": 11002 + }, + { + "epoch": 3.3398087721960845, + "grad_norm": 0.8191635608673096, + "learning_rate": 4.435051128885289e-05, + "loss": 0.956, + "step": 11003 + }, + { + "epoch": 3.3401123083927757, + "grad_norm": 0.9153105020523071, + "learning_rate": 4.4345449022982685e-05, + "loss": 0.8046, + "step": 11004 + }, + { + "epoch": 3.3404158445894674, + "grad_norm": 0.981857419013977, + "learning_rate": 4.434038675711249e-05, + "loss": 0.9211, + "step": 11005 + }, + { + "epoch": 3.3407193807861586, + "grad_norm": 1.1695274114608765, + "learning_rate": 4.433532449124228e-05, + "loss": 0.9371, + "step": 11006 + }, + { + "epoch": 3.3410229169828503, + "grad_norm": 1.0738000869750977, + "learning_rate": 4.4330262225372076e-05, + "loss": 0.6391, + "step": 11007 + }, + { + "epoch": 3.3413264531795415, + "grad_norm": 0.9377501606941223, + "learning_rate": 4.432519995950187e-05, + "loss": 1.2493, + "step": 11008 + }, + { + "epoch": 3.341629989376233, + "grad_norm": 1.0371283292770386, + "learning_rate": 4.4320137693631666e-05, + "loss": 0.9155, + "step": 11009 + }, + { + "epoch": 3.341933525572925, + "grad_norm": 1.0268205404281616, + "learning_rate": 4.431507542776147e-05, + "loss": 1.0034, + "step": 11010 + }, + { + "epoch": 3.342237061769616, + "grad_norm": 0.6147589683532715, + "learning_rate": 4.431001316189127e-05, + "loss": 0.658, + "step": 11011 + }, + { + "epoch": 3.3425405979663076, + "grad_norm": 1.0454167127609253, + "learning_rate": 4.4304950896021064e-05, + "loss": 0.4701, + "step": 11012 + }, + { + "epoch": 3.342844134162999, + "grad_norm": 1.017411231994629, + "learning_rate": 4.429988863015086e-05, + "loss": 0.8054, + "step": 11013 + }, + { + "epoch": 3.3431476703596905, + "grad_norm": 0.7506675720214844, + "learning_rate": 4.4294826364280654e-05, + "loss": 0.6455, + "step": 11014 + }, + { + "epoch": 3.3434512065563817, + "grad_norm": 0.9943332672119141, + "learning_rate": 4.4289764098410455e-05, + "loss": 0.9629, + "step": 11015 + }, + { + "epoch": 3.3437547427530734, + "grad_norm": 0.8806918859481812, + "learning_rate": 4.428470183254025e-05, + "loss": 0.9913, + "step": 11016 + }, + { + "epoch": 3.3440582789497646, + "grad_norm": 1.1383310556411743, + "learning_rate": 4.4279639566670045e-05, + "loss": 0.5983, + "step": 11017 + }, + { + "epoch": 3.3443618151464563, + "grad_norm": 1.263742446899414, + "learning_rate": 4.427457730079984e-05, + "loss": 0.764, + "step": 11018 + }, + { + "epoch": 3.3446653513431475, + "grad_norm": 1.1885840892791748, + "learning_rate": 4.4269515034929634e-05, + "loss": 1.2828, + "step": 11019 + }, + { + "epoch": 3.344968887539839, + "grad_norm": 0.9624831676483154, + "learning_rate": 4.4264452769059436e-05, + "loss": 0.615, + "step": 11020 + }, + { + "epoch": 3.3452724237365308, + "grad_norm": 1.14090895652771, + "learning_rate": 4.425939050318923e-05, + "loss": 0.6973, + "step": 11021 + }, + { + "epoch": 3.345575959933222, + "grad_norm": 1.0292391777038574, + "learning_rate": 4.4254328237319025e-05, + "loss": 0.854, + "step": 11022 + }, + { + "epoch": 3.3458794961299136, + "grad_norm": 1.134187936782837, + "learning_rate": 4.424926597144882e-05, + "loss": 0.8914, + "step": 11023 + }, + { + "epoch": 3.346183032326605, + "grad_norm": 0.8634889721870422, + "learning_rate": 4.424420370557862e-05, + "loss": 0.8837, + "step": 11024 + }, + { + "epoch": 3.3464865685232965, + "grad_norm": 1.0344593524932861, + "learning_rate": 4.4239141439708417e-05, + "loss": 0.9993, + "step": 11025 + }, + { + "epoch": 3.3467901047199877, + "grad_norm": 0.7718467116355896, + "learning_rate": 4.423407917383821e-05, + "loss": 0.9805, + "step": 11026 + }, + { + "epoch": 3.3470936409166794, + "grad_norm": 1.182938575744629, + "learning_rate": 4.4229016907968006e-05, + "loss": 0.9841, + "step": 11027 + }, + { + "epoch": 3.3473971771133706, + "grad_norm": 1.041308879852295, + "learning_rate": 4.42239546420978e-05, + "loss": 1.0446, + "step": 11028 + }, + { + "epoch": 3.3477007133100622, + "grad_norm": 1.0024820566177368, + "learning_rate": 4.42188923762276e-05, + "loss": 1.1448, + "step": 11029 + }, + { + "epoch": 3.3480042495067535, + "grad_norm": 0.7676887512207031, + "learning_rate": 4.42138301103574e-05, + "loss": 0.7913, + "step": 11030 + }, + { + "epoch": 3.348307785703445, + "grad_norm": 1.1021027565002441, + "learning_rate": 4.420876784448719e-05, + "loss": 0.9143, + "step": 11031 + }, + { + "epoch": 3.3486113219001368, + "grad_norm": 0.9699108600616455, + "learning_rate": 4.420370557861699e-05, + "loss": 0.7889, + "step": 11032 + }, + { + "epoch": 3.348914858096828, + "grad_norm": 1.1651097536087036, + "learning_rate": 4.419864331274679e-05, + "loss": 0.6009, + "step": 11033 + }, + { + "epoch": 3.3492183942935196, + "grad_norm": 0.9559666514396667, + "learning_rate": 4.419358104687658e-05, + "loss": 1.232, + "step": 11034 + }, + { + "epoch": 3.349521930490211, + "grad_norm": 0.7897623777389526, + "learning_rate": 4.4188518781006385e-05, + "loss": 1.4642, + "step": 11035 + }, + { + "epoch": 3.3498254666869025, + "grad_norm": 0.9410789012908936, + "learning_rate": 4.418345651513618e-05, + "loss": 1.1745, + "step": 11036 + }, + { + "epoch": 3.3501290028835937, + "grad_norm": 1.1459516286849976, + "learning_rate": 4.4178394249265974e-05, + "loss": 0.9194, + "step": 11037 + }, + { + "epoch": 3.3504325390802854, + "grad_norm": 0.5954509973526001, + "learning_rate": 4.417333198339577e-05, + "loss": 0.2752, + "step": 11038 + }, + { + "epoch": 3.3507360752769766, + "grad_norm": 0.8633032441139221, + "learning_rate": 4.416826971752557e-05, + "loss": 0.6146, + "step": 11039 + }, + { + "epoch": 3.3510396114736682, + "grad_norm": 1.1020350456237793, + "learning_rate": 4.4163207451655366e-05, + "loss": 0.9846, + "step": 11040 + }, + { + "epoch": 3.3513431476703595, + "grad_norm": 0.8769760727882385, + "learning_rate": 4.415814518578516e-05, + "loss": 1.1434, + "step": 11041 + }, + { + "epoch": 3.351646683867051, + "grad_norm": 0.7439713478088379, + "learning_rate": 4.4153082919914955e-05, + "loss": 1.014, + "step": 11042 + }, + { + "epoch": 3.3519502200637428, + "grad_norm": 0.9776040315628052, + "learning_rate": 4.414802065404475e-05, + "loss": 1.1007, + "step": 11043 + }, + { + "epoch": 3.352253756260434, + "grad_norm": 1.124556303024292, + "learning_rate": 4.414295838817455e-05, + "loss": 0.9957, + "step": 11044 + }, + { + "epoch": 3.3525572924571256, + "grad_norm": 0.9449612498283386, + "learning_rate": 4.4137896122304346e-05, + "loss": 0.734, + "step": 11045 + }, + { + "epoch": 3.352860828653817, + "grad_norm": 1.0150214433670044, + "learning_rate": 4.413283385643414e-05, + "loss": 1.0613, + "step": 11046 + }, + { + "epoch": 3.3531643648505085, + "grad_norm": 1.0167527198791504, + "learning_rate": 4.4127771590563936e-05, + "loss": 0.764, + "step": 11047 + }, + { + "epoch": 3.3534679010471997, + "grad_norm": 1.2588480710983276, + "learning_rate": 4.412270932469373e-05, + "loss": 0.69, + "step": 11048 + }, + { + "epoch": 3.3537714372438914, + "grad_norm": 1.2376141548156738, + "learning_rate": 4.411764705882353e-05, + "loss": 0.8094, + "step": 11049 + }, + { + "epoch": 3.354074973440583, + "grad_norm": 0.7903802394866943, + "learning_rate": 4.411258479295333e-05, + "loss": 0.6127, + "step": 11050 + }, + { + "epoch": 3.3543785096372742, + "grad_norm": 1.0813982486724854, + "learning_rate": 4.410752252708312e-05, + "loss": 0.7652, + "step": 11051 + }, + { + "epoch": 3.3546820458339655, + "grad_norm": 1.411493182182312, + "learning_rate": 4.410246026121292e-05, + "loss": 0.6341, + "step": 11052 + }, + { + "epoch": 3.354985582030657, + "grad_norm": 1.0332156419754028, + "learning_rate": 4.409739799534272e-05, + "loss": 1.0563, + "step": 11053 + }, + { + "epoch": 3.3552891182273488, + "grad_norm": 0.8322957754135132, + "learning_rate": 4.409233572947251e-05, + "loss": 1.5432, + "step": 11054 + }, + { + "epoch": 3.35559265442404, + "grad_norm": 0.998724639415741, + "learning_rate": 4.4087273463602315e-05, + "loss": 0.9778, + "step": 11055 + }, + { + "epoch": 3.3558961906207316, + "grad_norm": 1.0323874950408936, + "learning_rate": 4.408221119773211e-05, + "loss": 0.8895, + "step": 11056 + }, + { + "epoch": 3.356199726817423, + "grad_norm": 0.9936960935592651, + "learning_rate": 4.4077148931861904e-05, + "loss": 1.1114, + "step": 11057 + }, + { + "epoch": 3.3565032630141145, + "grad_norm": 0.9939408898353577, + "learning_rate": 4.40720866659917e-05, + "loss": 1.0038, + "step": 11058 + }, + { + "epoch": 3.3568067992108057, + "grad_norm": 1.0545432567596436, + "learning_rate": 4.40670244001215e-05, + "loss": 0.7668, + "step": 11059 + }, + { + "epoch": 3.3571103354074974, + "grad_norm": 0.936231791973114, + "learning_rate": 4.4061962134251295e-05, + "loss": 0.6953, + "step": 11060 + }, + { + "epoch": 3.357413871604189, + "grad_norm": 0.9481295943260193, + "learning_rate": 4.405689986838109e-05, + "loss": 1.3818, + "step": 11061 + }, + { + "epoch": 3.3577174078008802, + "grad_norm": 0.9908595681190491, + "learning_rate": 4.4051837602510885e-05, + "loss": 0.6949, + "step": 11062 + }, + { + "epoch": 3.358020943997572, + "grad_norm": 1.2029969692230225, + "learning_rate": 4.4046775336640687e-05, + "loss": 0.7447, + "step": 11063 + }, + { + "epoch": 3.358324480194263, + "grad_norm": 0.9881142973899841, + "learning_rate": 4.404171307077048e-05, + "loss": 0.9445, + "step": 11064 + }, + { + "epoch": 3.3586280163909548, + "grad_norm": 0.791097104549408, + "learning_rate": 4.4036650804900276e-05, + "loss": 1.2178, + "step": 11065 + }, + { + "epoch": 3.358931552587646, + "grad_norm": 0.5514218211174011, + "learning_rate": 4.403158853903007e-05, + "loss": 0.8756, + "step": 11066 + }, + { + "epoch": 3.3592350887843376, + "grad_norm": 1.1574491262435913, + "learning_rate": 4.4026526273159866e-05, + "loss": 0.6753, + "step": 11067 + }, + { + "epoch": 3.359538624981029, + "grad_norm": 0.9505230188369751, + "learning_rate": 4.402146400728967e-05, + "loss": 0.8316, + "step": 11068 + }, + { + "epoch": 3.3598421611777205, + "grad_norm": 1.2203385829925537, + "learning_rate": 4.401640174141946e-05, + "loss": 0.8389, + "step": 11069 + }, + { + "epoch": 3.3601456973744117, + "grad_norm": 1.1020116806030273, + "learning_rate": 4.401133947554926e-05, + "loss": 0.5493, + "step": 11070 + }, + { + "epoch": 3.3604492335711034, + "grad_norm": 0.826471745967865, + "learning_rate": 4.400627720967905e-05, + "loss": 0.8902, + "step": 11071 + }, + { + "epoch": 3.360752769767795, + "grad_norm": 0.9305258989334106, + "learning_rate": 4.4001214943808846e-05, + "loss": 1.1107, + "step": 11072 + }, + { + "epoch": 3.3610563059644862, + "grad_norm": 0.9449988007545471, + "learning_rate": 4.399615267793865e-05, + "loss": 0.8012, + "step": 11073 + }, + { + "epoch": 3.361359842161178, + "grad_norm": 1.1232578754425049, + "learning_rate": 4.399109041206844e-05, + "loss": 0.8461, + "step": 11074 + }, + { + "epoch": 3.361663378357869, + "grad_norm": 0.9352847337722778, + "learning_rate": 4.398602814619824e-05, + "loss": 0.9987, + "step": 11075 + }, + { + "epoch": 3.3619669145545608, + "grad_norm": 0.8257738351821899, + "learning_rate": 4.398096588032803e-05, + "loss": 0.6822, + "step": 11076 + }, + { + "epoch": 3.362270450751252, + "grad_norm": 0.7710626721382141, + "learning_rate": 4.3975903614457834e-05, + "loss": 0.7791, + "step": 11077 + }, + { + "epoch": 3.3625739869479436, + "grad_norm": 0.8186991214752197, + "learning_rate": 4.3970841348587636e-05, + "loss": 0.7544, + "step": 11078 + }, + { + "epoch": 3.362877523144635, + "grad_norm": 1.0076688528060913, + "learning_rate": 4.396577908271743e-05, + "loss": 1.0359, + "step": 11079 + }, + { + "epoch": 3.3631810593413265, + "grad_norm": 1.193880558013916, + "learning_rate": 4.3960716816847225e-05, + "loss": 0.5912, + "step": 11080 + }, + { + "epoch": 3.3634845955380177, + "grad_norm": 1.0114336013793945, + "learning_rate": 4.395565455097702e-05, + "loss": 1.1997, + "step": 11081 + }, + { + "epoch": 3.3637881317347094, + "grad_norm": 0.9775505661964417, + "learning_rate": 4.3950592285106815e-05, + "loss": 1.0146, + "step": 11082 + }, + { + "epoch": 3.364091667931401, + "grad_norm": 1.0274958610534668, + "learning_rate": 4.3945530019236616e-05, + "loss": 0.6688, + "step": 11083 + }, + { + "epoch": 3.3643952041280922, + "grad_norm": 1.319023609161377, + "learning_rate": 4.394046775336641e-05, + "loss": 1.0431, + "step": 11084 + }, + { + "epoch": 3.364698740324784, + "grad_norm": 1.1133911609649658, + "learning_rate": 4.3935405487496206e-05, + "loss": 0.6877, + "step": 11085 + }, + { + "epoch": 3.365002276521475, + "grad_norm": 0.9162386655807495, + "learning_rate": 4.3930343221626e-05, + "loss": 0.8984, + "step": 11086 + }, + { + "epoch": 3.3653058127181668, + "grad_norm": 1.0204176902770996, + "learning_rate": 4.3925280955755796e-05, + "loss": 0.4898, + "step": 11087 + }, + { + "epoch": 3.365609348914858, + "grad_norm": 0.9278808236122131, + "learning_rate": 4.39202186898856e-05, + "loss": 1.3503, + "step": 11088 + }, + { + "epoch": 3.3659128851115496, + "grad_norm": 1.276100754737854, + "learning_rate": 4.391515642401539e-05, + "loss": 1.0284, + "step": 11089 + }, + { + "epoch": 3.366216421308241, + "grad_norm": 0.8916330933570862, + "learning_rate": 4.391009415814519e-05, + "loss": 0.8058, + "step": 11090 + }, + { + "epoch": 3.3665199575049325, + "grad_norm": 0.9330226182937622, + "learning_rate": 4.390503189227498e-05, + "loss": 1.2123, + "step": 11091 + }, + { + "epoch": 3.3668234937016237, + "grad_norm": 0.9695453643798828, + "learning_rate": 4.389996962640478e-05, + "loss": 1.0213, + "step": 11092 + }, + { + "epoch": 3.3671270298983154, + "grad_norm": 1.1185232400894165, + "learning_rate": 4.389490736053458e-05, + "loss": 0.6356, + "step": 11093 + }, + { + "epoch": 3.367430566095007, + "grad_norm": 1.0594892501831055, + "learning_rate": 4.388984509466437e-05, + "loss": 1.1838, + "step": 11094 + }, + { + "epoch": 3.3677341022916982, + "grad_norm": 1.0396744012832642, + "learning_rate": 4.388478282879417e-05, + "loss": 0.9601, + "step": 11095 + }, + { + "epoch": 3.36803763848839, + "grad_norm": 1.0960280895233154, + "learning_rate": 4.387972056292396e-05, + "loss": 1.0603, + "step": 11096 + }, + { + "epoch": 3.368341174685081, + "grad_norm": 0.9547663331031799, + "learning_rate": 4.3874658297053764e-05, + "loss": 0.4383, + "step": 11097 + }, + { + "epoch": 3.3686447108817728, + "grad_norm": 1.1598211526870728, + "learning_rate": 4.386959603118356e-05, + "loss": 1.1376, + "step": 11098 + }, + { + "epoch": 3.368948247078464, + "grad_norm": 0.9284223318099976, + "learning_rate": 4.386453376531335e-05, + "loss": 1.3279, + "step": 11099 + }, + { + "epoch": 3.3692517832751556, + "grad_norm": 1.1557056903839111, + "learning_rate": 4.3859471499443155e-05, + "loss": 1.2298, + "step": 11100 + }, + { + "epoch": 3.369555319471847, + "grad_norm": 1.0002943277359009, + "learning_rate": 4.385440923357295e-05, + "loss": 0.9405, + "step": 11101 + }, + { + "epoch": 3.3698588556685385, + "grad_norm": 1.1648029088974, + "learning_rate": 4.384934696770275e-05, + "loss": 0.7776, + "step": 11102 + }, + { + "epoch": 3.3701623918652297, + "grad_norm": 0.9613409638404846, + "learning_rate": 4.3844284701832546e-05, + "loss": 1.0053, + "step": 11103 + }, + { + "epoch": 3.3704659280619214, + "grad_norm": 1.105421781539917, + "learning_rate": 4.383922243596234e-05, + "loss": 0.8279, + "step": 11104 + }, + { + "epoch": 3.370769464258613, + "grad_norm": 1.0452386140823364, + "learning_rate": 4.3834160170092136e-05, + "loss": 0.5799, + "step": 11105 + }, + { + "epoch": 3.3710730004553042, + "grad_norm": 0.9229490160942078, + "learning_rate": 4.382909790422193e-05, + "loss": 1.0908, + "step": 11106 + }, + { + "epoch": 3.371376536651996, + "grad_norm": 1.1365147829055786, + "learning_rate": 4.382403563835173e-05, + "loss": 0.8401, + "step": 11107 + }, + { + "epoch": 3.371680072848687, + "grad_norm": 0.9556456804275513, + "learning_rate": 4.381897337248153e-05, + "loss": 1.2632, + "step": 11108 + }, + { + "epoch": 3.3719836090453787, + "grad_norm": 1.2630316019058228, + "learning_rate": 4.381391110661132e-05, + "loss": 1.0492, + "step": 11109 + }, + { + "epoch": 3.37228714524207, + "grad_norm": 1.1264828443527222, + "learning_rate": 4.3808848840741116e-05, + "loss": 0.7269, + "step": 11110 + }, + { + "epoch": 3.3725906814387616, + "grad_norm": 1.0559065341949463, + "learning_rate": 4.380378657487091e-05, + "loss": 1.232, + "step": 11111 + }, + { + "epoch": 3.3728942176354533, + "grad_norm": 0.9610334634780884, + "learning_rate": 4.379872430900071e-05, + "loss": 1.1311, + "step": 11112 + }, + { + "epoch": 3.3731977538321445, + "grad_norm": 0.9011523723602295, + "learning_rate": 4.379366204313051e-05, + "loss": 0.7168, + "step": 11113 + }, + { + "epoch": 3.3735012900288357, + "grad_norm": 1.1094415187835693, + "learning_rate": 4.37885997772603e-05, + "loss": 0.6192, + "step": 11114 + }, + { + "epoch": 3.3738048262255274, + "grad_norm": 1.2744410037994385, + "learning_rate": 4.37835375113901e-05, + "loss": 0.8786, + "step": 11115 + }, + { + "epoch": 3.374108362422219, + "grad_norm": 1.0969855785369873, + "learning_rate": 4.37784752455199e-05, + "loss": 0.9117, + "step": 11116 + }, + { + "epoch": 3.37441189861891, + "grad_norm": 1.1921651363372803, + "learning_rate": 4.3773412979649694e-05, + "loss": 0.9126, + "step": 11117 + }, + { + "epoch": 3.374715434815602, + "grad_norm": 0.9259141683578491, + "learning_rate": 4.376835071377949e-05, + "loss": 1.2737, + "step": 11118 + }, + { + "epoch": 3.375018971012293, + "grad_norm": 1.110809326171875, + "learning_rate": 4.376328844790928e-05, + "loss": 0.6586, + "step": 11119 + }, + { + "epoch": 3.3753225072089847, + "grad_norm": 1.0401978492736816, + "learning_rate": 4.375822618203908e-05, + "loss": 1.0633, + "step": 11120 + }, + { + "epoch": 3.375626043405676, + "grad_norm": 1.0232112407684326, + "learning_rate": 4.375316391616888e-05, + "loss": 0.9139, + "step": 11121 + }, + { + "epoch": 3.3759295796023676, + "grad_norm": 1.1789039373397827, + "learning_rate": 4.374810165029868e-05, + "loss": 0.6419, + "step": 11122 + }, + { + "epoch": 3.3762331157990593, + "grad_norm": 1.1436289548873901, + "learning_rate": 4.3743039384428476e-05, + "loss": 0.8462, + "step": 11123 + }, + { + "epoch": 3.3765366519957505, + "grad_norm": 0.8322597146034241, + "learning_rate": 4.373797711855827e-05, + "loss": 1.3864, + "step": 11124 + }, + { + "epoch": 3.376840188192442, + "grad_norm": 0.7684193253517151, + "learning_rate": 4.3732914852688065e-05, + "loss": 1.3969, + "step": 11125 + }, + { + "epoch": 3.3771437243891334, + "grad_norm": 0.9189038276672363, + "learning_rate": 4.372785258681786e-05, + "loss": 0.6913, + "step": 11126 + }, + { + "epoch": 3.377447260585825, + "grad_norm": 0.8024885654449463, + "learning_rate": 4.372279032094766e-05, + "loss": 1.4167, + "step": 11127 + }, + { + "epoch": 3.377750796782516, + "grad_norm": 1.0043259859085083, + "learning_rate": 4.371772805507746e-05, + "loss": 1.1966, + "step": 11128 + }, + { + "epoch": 3.378054332979208, + "grad_norm": 0.9576742053031921, + "learning_rate": 4.371266578920725e-05, + "loss": 0.9612, + "step": 11129 + }, + { + "epoch": 3.378357869175899, + "grad_norm": 1.0342203378677368, + "learning_rate": 4.3707603523337046e-05, + "loss": 1.273, + "step": 11130 + }, + { + "epoch": 3.3786614053725907, + "grad_norm": 1.0702886581420898, + "learning_rate": 4.370254125746685e-05, + "loss": 0.9742, + "step": 11131 + }, + { + "epoch": 3.378964941569282, + "grad_norm": 1.0954909324645996, + "learning_rate": 4.369747899159664e-05, + "loss": 0.3435, + "step": 11132 + }, + { + "epoch": 3.3792684777659736, + "grad_norm": 0.9873117804527283, + "learning_rate": 4.369241672572644e-05, + "loss": 0.9571, + "step": 11133 + }, + { + "epoch": 3.3795720139626653, + "grad_norm": 1.1948156356811523, + "learning_rate": 4.368735445985623e-05, + "loss": 0.7143, + "step": 11134 + }, + { + "epoch": 3.3798755501593565, + "grad_norm": 0.8648605942726135, + "learning_rate": 4.368229219398603e-05, + "loss": 0.5641, + "step": 11135 + }, + { + "epoch": 3.380179086356048, + "grad_norm": 1.136048674583435, + "learning_rate": 4.367722992811583e-05, + "loss": 0.8459, + "step": 11136 + }, + { + "epoch": 3.3804826225527393, + "grad_norm": 0.9702625274658203, + "learning_rate": 4.367216766224562e-05, + "loss": 0.4431, + "step": 11137 + }, + { + "epoch": 3.380786158749431, + "grad_norm": 0.8680826425552368, + "learning_rate": 4.366710539637542e-05, + "loss": 0.9478, + "step": 11138 + }, + { + "epoch": 3.381089694946122, + "grad_norm": 0.9408009052276611, + "learning_rate": 4.366204313050521e-05, + "loss": 0.8907, + "step": 11139 + }, + { + "epoch": 3.381393231142814, + "grad_norm": 1.1215790510177612, + "learning_rate": 4.365698086463501e-05, + "loss": 1.0605, + "step": 11140 + }, + { + "epoch": 3.381696767339505, + "grad_norm": 1.022484302520752, + "learning_rate": 4.365191859876481e-05, + "loss": 1.1152, + "step": 11141 + }, + { + "epoch": 3.3820003035361967, + "grad_norm": 1.187207579612732, + "learning_rate": 4.3646856332894604e-05, + "loss": 0.828, + "step": 11142 + }, + { + "epoch": 3.382303839732888, + "grad_norm": 0.8248500227928162, + "learning_rate": 4.36417940670244e-05, + "loss": 0.7726, + "step": 11143 + }, + { + "epoch": 3.3826073759295796, + "grad_norm": 1.193074107170105, + "learning_rate": 4.36367318011542e-05, + "loss": 0.5612, + "step": 11144 + }, + { + "epoch": 3.3829109121262713, + "grad_norm": 1.0054807662963867, + "learning_rate": 4.3631669535283995e-05, + "loss": 1.0689, + "step": 11145 + }, + { + "epoch": 3.3832144483229625, + "grad_norm": 0.9676352143287659, + "learning_rate": 4.36266072694138e-05, + "loss": 0.8216, + "step": 11146 + }, + { + "epoch": 3.383517984519654, + "grad_norm": 1.026658535003662, + "learning_rate": 4.362154500354359e-05, + "loss": 1.2398, + "step": 11147 + }, + { + "epoch": 3.3838215207163453, + "grad_norm": 1.2132176160812378, + "learning_rate": 4.3616482737673386e-05, + "loss": 0.7776, + "step": 11148 + }, + { + "epoch": 3.384125056913037, + "grad_norm": 1.2175756692886353, + "learning_rate": 4.361142047180318e-05, + "loss": 0.7543, + "step": 11149 + }, + { + "epoch": 3.384428593109728, + "grad_norm": 1.1441831588745117, + "learning_rate": 4.3606358205932976e-05, + "loss": 0.8551, + "step": 11150 + }, + { + "epoch": 3.38473212930642, + "grad_norm": 1.1139713525772095, + "learning_rate": 4.360129594006278e-05, + "loss": 0.9502, + "step": 11151 + }, + { + "epoch": 3.385035665503111, + "grad_norm": 0.969123899936676, + "learning_rate": 4.359623367419257e-05, + "loss": 0.652, + "step": 11152 + }, + { + "epoch": 3.3853392016998027, + "grad_norm": 0.82719886302948, + "learning_rate": 4.359117140832237e-05, + "loss": 1.5107, + "step": 11153 + }, + { + "epoch": 3.385642737896494, + "grad_norm": 1.0074968338012695, + "learning_rate": 4.358610914245216e-05, + "loss": 0.5629, + "step": 11154 + }, + { + "epoch": 3.3859462740931856, + "grad_norm": 1.013174295425415, + "learning_rate": 4.3581046876581964e-05, + "loss": 0.5965, + "step": 11155 + }, + { + "epoch": 3.3862498102898773, + "grad_norm": 1.0356998443603516, + "learning_rate": 4.357598461071176e-05, + "loss": 0.7696, + "step": 11156 + }, + { + "epoch": 3.3865533464865685, + "grad_norm": 1.377439022064209, + "learning_rate": 4.357092234484155e-05, + "loss": 1.2339, + "step": 11157 + }, + { + "epoch": 3.38685688268326, + "grad_norm": 1.1538790464401245, + "learning_rate": 4.356586007897135e-05, + "loss": 0.65, + "step": 11158 + }, + { + "epoch": 3.3871604188799513, + "grad_norm": 1.1675184965133667, + "learning_rate": 4.356079781310114e-05, + "loss": 1.106, + "step": 11159 + }, + { + "epoch": 3.387463955076643, + "grad_norm": 1.1323292255401611, + "learning_rate": 4.3555735547230944e-05, + "loss": 0.7097, + "step": 11160 + }, + { + "epoch": 3.387767491273334, + "grad_norm": 1.1334291696548462, + "learning_rate": 4.355067328136074e-05, + "loss": 0.9728, + "step": 11161 + }, + { + "epoch": 3.388071027470026, + "grad_norm": 0.9110361337661743, + "learning_rate": 4.3545611015490534e-05, + "loss": 0.3733, + "step": 11162 + }, + { + "epoch": 3.388374563666717, + "grad_norm": 0.9186685681343079, + "learning_rate": 4.354054874962033e-05, + "loss": 0.4137, + "step": 11163 + }, + { + "epoch": 3.3886780998634087, + "grad_norm": 1.0343124866485596, + "learning_rate": 4.3535486483750123e-05, + "loss": 1.1307, + "step": 11164 + }, + { + "epoch": 3.3889816360601, + "grad_norm": 1.2049212455749512, + "learning_rate": 4.3530424217879925e-05, + "loss": 0.8328, + "step": 11165 + }, + { + "epoch": 3.3892851722567916, + "grad_norm": 0.8319340348243713, + "learning_rate": 4.352536195200972e-05, + "loss": 0.8472, + "step": 11166 + }, + { + "epoch": 3.3895887084534833, + "grad_norm": 1.0677204132080078, + "learning_rate": 4.352029968613952e-05, + "loss": 0.7299, + "step": 11167 + }, + { + "epoch": 3.3898922446501745, + "grad_norm": 1.087017297744751, + "learning_rate": 4.3515237420269316e-05, + "loss": 0.9764, + "step": 11168 + }, + { + "epoch": 3.390195780846866, + "grad_norm": 1.0864224433898926, + "learning_rate": 4.351017515439911e-05, + "loss": 0.7262, + "step": 11169 + }, + { + "epoch": 3.3904993170435573, + "grad_norm": 1.0477935075759888, + "learning_rate": 4.350511288852891e-05, + "loss": 0.998, + "step": 11170 + }, + { + "epoch": 3.390802853240249, + "grad_norm": 0.9894422292709351, + "learning_rate": 4.350005062265871e-05, + "loss": 0.99, + "step": 11171 + }, + { + "epoch": 3.39110638943694, + "grad_norm": 0.948259174823761, + "learning_rate": 4.34949883567885e-05, + "loss": 0.8285, + "step": 11172 + }, + { + "epoch": 3.391409925633632, + "grad_norm": 0.8385151028633118, + "learning_rate": 4.34899260909183e-05, + "loss": 0.6918, + "step": 11173 + }, + { + "epoch": 3.391713461830323, + "grad_norm": 1.1372063159942627, + "learning_rate": 4.348486382504809e-05, + "loss": 0.8636, + "step": 11174 + }, + { + "epoch": 3.3920169980270147, + "grad_norm": 0.973361074924469, + "learning_rate": 4.347980155917789e-05, + "loss": 0.6895, + "step": 11175 + }, + { + "epoch": 3.392320534223706, + "grad_norm": 1.0254693031311035, + "learning_rate": 4.347473929330769e-05, + "loss": 0.9941, + "step": 11176 + }, + { + "epoch": 3.3926240704203976, + "grad_norm": 1.0836663246154785, + "learning_rate": 4.346967702743748e-05, + "loss": 0.917, + "step": 11177 + }, + { + "epoch": 3.3929276066170893, + "grad_norm": 1.1260489225387573, + "learning_rate": 4.346461476156728e-05, + "loss": 0.871, + "step": 11178 + }, + { + "epoch": 3.3932311428137805, + "grad_norm": 0.9051371216773987, + "learning_rate": 4.345955249569707e-05, + "loss": 1.1206, + "step": 11179 + }, + { + "epoch": 3.393534679010472, + "grad_norm": 0.9649965167045593, + "learning_rate": 4.3454490229826874e-05, + "loss": 0.8965, + "step": 11180 + }, + { + "epoch": 3.3938382152071633, + "grad_norm": 1.229691982269287, + "learning_rate": 4.344942796395667e-05, + "loss": 0.5088, + "step": 11181 + }, + { + "epoch": 3.394141751403855, + "grad_norm": 0.92978435754776, + "learning_rate": 4.3444365698086464e-05, + "loss": 1.1677, + "step": 11182 + }, + { + "epoch": 3.394445287600546, + "grad_norm": 1.1464147567749023, + "learning_rate": 4.343930343221626e-05, + "loss": 0.9226, + "step": 11183 + }, + { + "epoch": 3.394748823797238, + "grad_norm": 1.2325044870376587, + "learning_rate": 4.343424116634606e-05, + "loss": 0.8476, + "step": 11184 + }, + { + "epoch": 3.3950523599939295, + "grad_norm": 1.0228196382522583, + "learning_rate": 4.3429178900475855e-05, + "loss": 0.781, + "step": 11185 + }, + { + "epoch": 3.3953558961906207, + "grad_norm": 0.8366285562515259, + "learning_rate": 4.342411663460565e-05, + "loss": 0.794, + "step": 11186 + }, + { + "epoch": 3.395659432387312, + "grad_norm": 1.1898764371871948, + "learning_rate": 4.3419054368735444e-05, + "loss": 0.6966, + "step": 11187 + }, + { + "epoch": 3.3959629685840036, + "grad_norm": 1.014097809791565, + "learning_rate": 4.341399210286524e-05, + "loss": 0.9985, + "step": 11188 + }, + { + "epoch": 3.3962665047806953, + "grad_norm": 1.0949811935424805, + "learning_rate": 4.340892983699504e-05, + "loss": 1.0451, + "step": 11189 + }, + { + "epoch": 3.3965700409773865, + "grad_norm": 0.9968173503875732, + "learning_rate": 4.340386757112484e-05, + "loss": 1.0004, + "step": 11190 + }, + { + "epoch": 3.396873577174078, + "grad_norm": 0.6825324296951294, + "learning_rate": 4.339880530525464e-05, + "loss": 0.4084, + "step": 11191 + }, + { + "epoch": 3.3971771133707693, + "grad_norm": 1.0030479431152344, + "learning_rate": 4.339374303938443e-05, + "loss": 1.2079, + "step": 11192 + }, + { + "epoch": 3.397480649567461, + "grad_norm": 1.1695003509521484, + "learning_rate": 4.338868077351423e-05, + "loss": 0.7655, + "step": 11193 + }, + { + "epoch": 3.397784185764152, + "grad_norm": 1.2651556730270386, + "learning_rate": 4.338361850764403e-05, + "loss": 0.905, + "step": 11194 + }, + { + "epoch": 3.398087721960844, + "grad_norm": 0.8903501033782959, + "learning_rate": 4.337855624177382e-05, + "loss": 1.4404, + "step": 11195 + }, + { + "epoch": 3.3983912581575355, + "grad_norm": 1.1483242511749268, + "learning_rate": 4.337349397590362e-05, + "loss": 1.0619, + "step": 11196 + }, + { + "epoch": 3.3986947943542267, + "grad_norm": 0.8944156765937805, + "learning_rate": 4.336843171003341e-05, + "loss": 1.4874, + "step": 11197 + }, + { + "epoch": 3.3989983305509184, + "grad_norm": 0.8962923288345337, + "learning_rate": 4.336336944416321e-05, + "loss": 1.2604, + "step": 11198 + }, + { + "epoch": 3.3993018667476096, + "grad_norm": 1.0435539484024048, + "learning_rate": 4.335830717829301e-05, + "loss": 0.8731, + "step": 11199 + }, + { + "epoch": 3.3996054029443012, + "grad_norm": 0.9906169772148132, + "learning_rate": 4.3353244912422804e-05, + "loss": 1.1067, + "step": 11200 + }, + { + "epoch": 3.3999089391409925, + "grad_norm": 0.9234976768493652, + "learning_rate": 4.33481826465526e-05, + "loss": 1.2027, + "step": 11201 + }, + { + "epoch": 3.400212475337684, + "grad_norm": 0.7002107501029968, + "learning_rate": 4.334312038068239e-05, + "loss": 1.0743, + "step": 11202 + }, + { + "epoch": 3.4005160115343753, + "grad_norm": 0.8271046876907349, + "learning_rate": 4.333805811481219e-05, + "loss": 0.8729, + "step": 11203 + }, + { + "epoch": 3.400819547731067, + "grad_norm": 1.0886693000793457, + "learning_rate": 4.333299584894199e-05, + "loss": 1.2209, + "step": 11204 + }, + { + "epoch": 3.401123083927758, + "grad_norm": 0.7729898691177368, + "learning_rate": 4.3327933583071785e-05, + "loss": 1.4791, + "step": 11205 + }, + { + "epoch": 3.40142662012445, + "grad_norm": 1.1772031784057617, + "learning_rate": 4.332287131720158e-05, + "loss": 0.9754, + "step": 11206 + }, + { + "epoch": 3.4017301563211415, + "grad_norm": 1.2348803281784058, + "learning_rate": 4.3317809051331374e-05, + "loss": 0.5648, + "step": 11207 + }, + { + "epoch": 3.4020336925178327, + "grad_norm": 0.9691221117973328, + "learning_rate": 4.3312746785461176e-05, + "loss": 1.0242, + "step": 11208 + }, + { + "epoch": 3.4023372287145244, + "grad_norm": 1.1430742740631104, + "learning_rate": 4.330768451959097e-05, + "loss": 0.84, + "step": 11209 + }, + { + "epoch": 3.4026407649112156, + "grad_norm": 0.7806965708732605, + "learning_rate": 4.3302622253720765e-05, + "loss": 0.6746, + "step": 11210 + }, + { + "epoch": 3.4029443011079072, + "grad_norm": 1.1952134370803833, + "learning_rate": 4.329755998785057e-05, + "loss": 0.9808, + "step": 11211 + }, + { + "epoch": 3.4032478373045985, + "grad_norm": 1.0283790826797485, + "learning_rate": 4.329249772198036e-05, + "loss": 1.033, + "step": 11212 + }, + { + "epoch": 3.40355137350129, + "grad_norm": 1.0666593313217163, + "learning_rate": 4.3287435456110156e-05, + "loss": 1.1154, + "step": 11213 + }, + { + "epoch": 3.4038549096979813, + "grad_norm": 0.7356647253036499, + "learning_rate": 4.328237319023996e-05, + "loss": 0.2142, + "step": 11214 + }, + { + "epoch": 3.404158445894673, + "grad_norm": 1.0589187145233154, + "learning_rate": 4.327731092436975e-05, + "loss": 1.2255, + "step": 11215 + }, + { + "epoch": 3.404461982091364, + "grad_norm": 1.1176382303237915, + "learning_rate": 4.327224865849955e-05, + "loss": 0.7987, + "step": 11216 + }, + { + "epoch": 3.404765518288056, + "grad_norm": 0.9496319890022278, + "learning_rate": 4.326718639262934e-05, + "loss": 1.2579, + "step": 11217 + }, + { + "epoch": 3.4050690544847475, + "grad_norm": 1.2868812084197998, + "learning_rate": 4.326212412675914e-05, + "loss": 0.9553, + "step": 11218 + }, + { + "epoch": 3.4053725906814387, + "grad_norm": 1.1212111711502075, + "learning_rate": 4.325706186088894e-05, + "loss": 0.9834, + "step": 11219 + }, + { + "epoch": 3.4056761268781304, + "grad_norm": 0.9784547090530396, + "learning_rate": 4.3251999595018734e-05, + "loss": 0.8669, + "step": 11220 + }, + { + "epoch": 3.4059796630748216, + "grad_norm": 1.1899213790893555, + "learning_rate": 4.324693732914853e-05, + "loss": 0.9752, + "step": 11221 + }, + { + "epoch": 3.4062831992715132, + "grad_norm": 1.0844616889953613, + "learning_rate": 4.324187506327832e-05, + "loss": 0.7197, + "step": 11222 + }, + { + "epoch": 3.4065867354682045, + "grad_norm": 0.797312319278717, + "learning_rate": 4.3236812797408125e-05, + "loss": 0.577, + "step": 11223 + }, + { + "epoch": 3.406890271664896, + "grad_norm": 0.6092471480369568, + "learning_rate": 4.323175053153792e-05, + "loss": 1.0937, + "step": 11224 + }, + { + "epoch": 3.4071938078615873, + "grad_norm": 1.0938823223114014, + "learning_rate": 4.3226688265667714e-05, + "loss": 0.9487, + "step": 11225 + }, + { + "epoch": 3.407497344058279, + "grad_norm": 1.173330545425415, + "learning_rate": 4.322162599979751e-05, + "loss": 1.0725, + "step": 11226 + }, + { + "epoch": 3.40780088025497, + "grad_norm": 0.9512575268745422, + "learning_rate": 4.3216563733927304e-05, + "loss": 0.3817, + "step": 11227 + }, + { + "epoch": 3.408104416451662, + "grad_norm": 0.8480756282806396, + "learning_rate": 4.3211501468057105e-05, + "loss": 0.3102, + "step": 11228 + }, + { + "epoch": 3.4084079526483535, + "grad_norm": 0.9716628193855286, + "learning_rate": 4.32064392021869e-05, + "loss": 1.3376, + "step": 11229 + }, + { + "epoch": 3.4087114888450447, + "grad_norm": 0.9838001132011414, + "learning_rate": 4.3201376936316695e-05, + "loss": 0.4546, + "step": 11230 + }, + { + "epoch": 3.4090150250417364, + "grad_norm": 1.0094945430755615, + "learning_rate": 4.319631467044649e-05, + "loss": 1.1361, + "step": 11231 + }, + { + "epoch": 3.4093185612384276, + "grad_norm": 1.0151097774505615, + "learning_rate": 4.3191252404576285e-05, + "loss": 1.2636, + "step": 11232 + }, + { + "epoch": 3.4096220974351192, + "grad_norm": 0.8380160331726074, + "learning_rate": 4.318619013870609e-05, + "loss": 1.0473, + "step": 11233 + }, + { + "epoch": 3.4099256336318104, + "grad_norm": 1.0491576194763184, + "learning_rate": 4.318112787283589e-05, + "loss": 0.6924, + "step": 11234 + }, + { + "epoch": 3.410229169828502, + "grad_norm": 1.0133543014526367, + "learning_rate": 4.317606560696568e-05, + "loss": 1.0603, + "step": 11235 + }, + { + "epoch": 3.4105327060251933, + "grad_norm": 0.9890870451927185, + "learning_rate": 4.317100334109548e-05, + "loss": 0.9721, + "step": 11236 + }, + { + "epoch": 3.410836242221885, + "grad_norm": 1.0907219648361206, + "learning_rate": 4.316594107522527e-05, + "loss": 0.9055, + "step": 11237 + }, + { + "epoch": 3.411139778418576, + "grad_norm": 1.184714436531067, + "learning_rate": 4.3160878809355074e-05, + "loss": 0.8599, + "step": 11238 + }, + { + "epoch": 3.411443314615268, + "grad_norm": 1.0400197505950928, + "learning_rate": 4.315581654348487e-05, + "loss": 1.2383, + "step": 11239 + }, + { + "epoch": 3.4117468508119595, + "grad_norm": 0.9222926497459412, + "learning_rate": 4.315075427761466e-05, + "loss": 1.1983, + "step": 11240 + }, + { + "epoch": 3.4120503870086507, + "grad_norm": 1.2537994384765625, + "learning_rate": 4.314569201174446e-05, + "loss": 0.7849, + "step": 11241 + }, + { + "epoch": 3.4123539232053424, + "grad_norm": 0.8401445150375366, + "learning_rate": 4.314062974587425e-05, + "loss": 0.7858, + "step": 11242 + }, + { + "epoch": 3.4126574594020336, + "grad_norm": 1.2764816284179688, + "learning_rate": 4.3135567480004055e-05, + "loss": 0.8211, + "step": 11243 + }, + { + "epoch": 3.4129609955987252, + "grad_norm": 1.2055851221084595, + "learning_rate": 4.313050521413385e-05, + "loss": 1.0758, + "step": 11244 + }, + { + "epoch": 3.4132645317954164, + "grad_norm": 1.0433343648910522, + "learning_rate": 4.3125442948263644e-05, + "loss": 1.1693, + "step": 11245 + }, + { + "epoch": 3.413568067992108, + "grad_norm": 0.8601516485214233, + "learning_rate": 4.312038068239344e-05, + "loss": 1.4252, + "step": 11246 + }, + { + "epoch": 3.4138716041887998, + "grad_norm": 1.324394941329956, + "learning_rate": 4.311531841652324e-05, + "loss": 0.738, + "step": 11247 + }, + { + "epoch": 3.414175140385491, + "grad_norm": 1.1970574855804443, + "learning_rate": 4.3110256150653035e-05, + "loss": 0.7412, + "step": 11248 + }, + { + "epoch": 3.414478676582182, + "grad_norm": 0.8299745321273804, + "learning_rate": 4.310519388478283e-05, + "loss": 0.8765, + "step": 11249 + }, + { + "epoch": 3.414782212778874, + "grad_norm": 1.0265402793884277, + "learning_rate": 4.3100131618912625e-05, + "loss": 0.9961, + "step": 11250 + }, + { + "epoch": 3.4150857489755655, + "grad_norm": 1.1152005195617676, + "learning_rate": 4.309506935304242e-05, + "loss": 0.9048, + "step": 11251 + }, + { + "epoch": 3.4153892851722567, + "grad_norm": 0.7666769623756409, + "learning_rate": 4.309000708717222e-05, + "loss": 1.0308, + "step": 11252 + }, + { + "epoch": 3.4156928213689484, + "grad_norm": 1.1563115119934082, + "learning_rate": 4.3084944821302016e-05, + "loss": 0.7752, + "step": 11253 + }, + { + "epoch": 3.4159963575656396, + "grad_norm": 0.8613836765289307, + "learning_rate": 4.307988255543181e-05, + "loss": 0.9133, + "step": 11254 + }, + { + "epoch": 3.4162998937623312, + "grad_norm": 1.182199478149414, + "learning_rate": 4.3074820289561606e-05, + "loss": 0.8123, + "step": 11255 + }, + { + "epoch": 3.4166034299590224, + "grad_norm": 0.7655694484710693, + "learning_rate": 4.306975802369141e-05, + "loss": 0.5682, + "step": 11256 + }, + { + "epoch": 3.416906966155714, + "grad_norm": 0.6948187351226807, + "learning_rate": 4.30646957578212e-05, + "loss": 1.3668, + "step": 11257 + }, + { + "epoch": 3.4172105023524058, + "grad_norm": 1.0102118253707886, + "learning_rate": 4.3059633491951004e-05, + "loss": 1.0069, + "step": 11258 + }, + { + "epoch": 3.417514038549097, + "grad_norm": 0.9736801981925964, + "learning_rate": 4.30545712260808e-05, + "loss": 1.1001, + "step": 11259 + }, + { + "epoch": 3.4178175747457886, + "grad_norm": 1.0169196128845215, + "learning_rate": 4.304950896021059e-05, + "loss": 0.6751, + "step": 11260 + }, + { + "epoch": 3.41812111094248, + "grad_norm": 1.293623924255371, + "learning_rate": 4.304444669434039e-05, + "loss": 1.084, + "step": 11261 + }, + { + "epoch": 3.4184246471391715, + "grad_norm": 1.2346819639205933, + "learning_rate": 4.303938442847019e-05, + "loss": 0.991, + "step": 11262 + }, + { + "epoch": 3.4187281833358627, + "grad_norm": 1.078264594078064, + "learning_rate": 4.3034322162599984e-05, + "loss": 0.8449, + "step": 11263 + }, + { + "epoch": 3.4190317195325544, + "grad_norm": 1.0058984756469727, + "learning_rate": 4.302925989672978e-05, + "loss": 1.0157, + "step": 11264 + }, + { + "epoch": 3.4193352557292456, + "grad_norm": 0.8553008437156677, + "learning_rate": 4.3024197630859574e-05, + "loss": 0.3934, + "step": 11265 + }, + { + "epoch": 3.4196387919259372, + "grad_norm": 0.8010063171386719, + "learning_rate": 4.301913536498937e-05, + "loss": 1.0249, + "step": 11266 + }, + { + "epoch": 3.4199423281226284, + "grad_norm": 1.1464972496032715, + "learning_rate": 4.301407309911917e-05, + "loss": 0.6448, + "step": 11267 + }, + { + "epoch": 3.42024586431932, + "grad_norm": 1.0349953174591064, + "learning_rate": 4.3009010833248965e-05, + "loss": 1.2323, + "step": 11268 + }, + { + "epoch": 3.4205494005160118, + "grad_norm": 1.1629717350006104, + "learning_rate": 4.300394856737876e-05, + "loss": 0.8618, + "step": 11269 + }, + { + "epoch": 3.420852936712703, + "grad_norm": 1.0424190759658813, + "learning_rate": 4.2998886301508555e-05, + "loss": 1.0391, + "step": 11270 + }, + { + "epoch": 3.4211564729093946, + "grad_norm": 1.2689813375473022, + "learning_rate": 4.299382403563835e-05, + "loss": 0.7986, + "step": 11271 + }, + { + "epoch": 3.421460009106086, + "grad_norm": 1.1151927709579468, + "learning_rate": 4.298876176976815e-05, + "loss": 0.4261, + "step": 11272 + }, + { + "epoch": 3.4217635453027775, + "grad_norm": 0.666191816329956, + "learning_rate": 4.2983699503897946e-05, + "loss": 0.1871, + "step": 11273 + }, + { + "epoch": 3.4220670814994687, + "grad_norm": 1.1634200811386108, + "learning_rate": 4.297863723802774e-05, + "loss": 1.3371, + "step": 11274 + }, + { + "epoch": 3.4223706176961604, + "grad_norm": 0.9470701217651367, + "learning_rate": 4.2973574972157535e-05, + "loss": 0.5549, + "step": 11275 + }, + { + "epoch": 3.4226741538928516, + "grad_norm": 1.0260618925094604, + "learning_rate": 4.296851270628734e-05, + "loss": 0.5021, + "step": 11276 + }, + { + "epoch": 3.4229776900895432, + "grad_norm": 0.9552202224731445, + "learning_rate": 4.296345044041713e-05, + "loss": 1.0849, + "step": 11277 + }, + { + "epoch": 3.4232812262862344, + "grad_norm": 1.0882254838943481, + "learning_rate": 4.295838817454693e-05, + "loss": 0.9356, + "step": 11278 + }, + { + "epoch": 3.423584762482926, + "grad_norm": 0.9959467053413391, + "learning_rate": 4.295332590867673e-05, + "loss": 1.2667, + "step": 11279 + }, + { + "epoch": 3.4238882986796177, + "grad_norm": 0.8414683938026428, + "learning_rate": 4.294826364280652e-05, + "loss": 0.961, + "step": 11280 + }, + { + "epoch": 3.424191834876309, + "grad_norm": 0.8000266551971436, + "learning_rate": 4.294320137693632e-05, + "loss": 1.4283, + "step": 11281 + }, + { + "epoch": 3.4244953710730006, + "grad_norm": 0.820584237575531, + "learning_rate": 4.293813911106612e-05, + "loss": 1.1261, + "step": 11282 + }, + { + "epoch": 3.424798907269692, + "grad_norm": 0.8128142952919006, + "learning_rate": 4.2933076845195914e-05, + "loss": 1.5669, + "step": 11283 + }, + { + "epoch": 3.4251024434663835, + "grad_norm": 1.0258336067199707, + "learning_rate": 4.292801457932571e-05, + "loss": 0.8316, + "step": 11284 + }, + { + "epoch": 3.4254059796630747, + "grad_norm": 1.1615201234817505, + "learning_rate": 4.2922952313455504e-05, + "loss": 0.6697, + "step": 11285 + }, + { + "epoch": 3.4257095158597664, + "grad_norm": 1.1560593843460083, + "learning_rate": 4.2917890047585305e-05, + "loss": 0.8922, + "step": 11286 + }, + { + "epoch": 3.4260130520564576, + "grad_norm": 0.903582751750946, + "learning_rate": 4.29128277817151e-05, + "loss": 0.5935, + "step": 11287 + }, + { + "epoch": 3.426316588253149, + "grad_norm": 1.1680340766906738, + "learning_rate": 4.2907765515844895e-05, + "loss": 1.3191, + "step": 11288 + }, + { + "epoch": 3.4266201244498404, + "grad_norm": 0.8603793978691101, + "learning_rate": 4.290270324997469e-05, + "loss": 0.515, + "step": 11289 + }, + { + "epoch": 3.426923660646532, + "grad_norm": 1.0343849658966064, + "learning_rate": 4.2897640984104484e-05, + "loss": 0.96, + "step": 11290 + }, + { + "epoch": 3.4272271968432237, + "grad_norm": 0.9173650741577148, + "learning_rate": 4.2892578718234286e-05, + "loss": 0.8257, + "step": 11291 + }, + { + "epoch": 3.427530733039915, + "grad_norm": 0.9393150210380554, + "learning_rate": 4.288751645236408e-05, + "loss": 1.0998, + "step": 11292 + }, + { + "epoch": 3.4278342692366066, + "grad_norm": 0.8886203765869141, + "learning_rate": 4.2882454186493876e-05, + "loss": 0.6702, + "step": 11293 + }, + { + "epoch": 3.428137805433298, + "grad_norm": 1.00777006149292, + "learning_rate": 4.287739192062367e-05, + "loss": 0.7967, + "step": 11294 + }, + { + "epoch": 3.4284413416299895, + "grad_norm": 0.9954153299331665, + "learning_rate": 4.2872329654753465e-05, + "loss": 0.8525, + "step": 11295 + }, + { + "epoch": 3.4287448778266807, + "grad_norm": 1.0022521018981934, + "learning_rate": 4.286726738888327e-05, + "loss": 0.9873, + "step": 11296 + }, + { + "epoch": 3.4290484140233723, + "grad_norm": 1.0121830701828003, + "learning_rate": 4.286220512301306e-05, + "loss": 1.2221, + "step": 11297 + }, + { + "epoch": 3.4293519502200636, + "grad_norm": 1.0499510765075684, + "learning_rate": 4.2857142857142856e-05, + "loss": 0.6507, + "step": 11298 + }, + { + "epoch": 3.429655486416755, + "grad_norm": 0.9530588984489441, + "learning_rate": 4.285208059127265e-05, + "loss": 0.7175, + "step": 11299 + }, + { + "epoch": 3.4299590226134464, + "grad_norm": 0.8611896634101868, + "learning_rate": 4.284701832540245e-05, + "loss": 0.4454, + "step": 11300 + }, + { + "epoch": 3.430262558810138, + "grad_norm": 1.309035062789917, + "learning_rate": 4.2841956059532254e-05, + "loss": 0.687, + "step": 11301 + }, + { + "epoch": 3.4305660950068297, + "grad_norm": 0.7603581547737122, + "learning_rate": 4.283689379366205e-05, + "loss": 1.4037, + "step": 11302 + }, + { + "epoch": 3.430869631203521, + "grad_norm": 1.0000485181808472, + "learning_rate": 4.2831831527791844e-05, + "loss": 0.8296, + "step": 11303 + }, + { + "epoch": 3.4311731674002126, + "grad_norm": 0.9895287752151489, + "learning_rate": 4.282676926192164e-05, + "loss": 0.6996, + "step": 11304 + }, + { + "epoch": 3.431476703596904, + "grad_norm": 0.8913344740867615, + "learning_rate": 4.2821706996051433e-05, + "loss": 0.54, + "step": 11305 + }, + { + "epoch": 3.4317802397935955, + "grad_norm": 1.1585955619812012, + "learning_rate": 4.2816644730181235e-05, + "loss": 1.1106, + "step": 11306 + }, + { + "epoch": 3.4320837759902867, + "grad_norm": 1.2317397594451904, + "learning_rate": 4.281158246431103e-05, + "loss": 0.8583, + "step": 11307 + }, + { + "epoch": 3.4323873121869783, + "grad_norm": 0.9068469405174255, + "learning_rate": 4.2806520198440825e-05, + "loss": 1.1422, + "step": 11308 + }, + { + "epoch": 3.43269084838367, + "grad_norm": 0.7429954409599304, + "learning_rate": 4.280145793257062e-05, + "loss": 1.0565, + "step": 11309 + }, + { + "epoch": 3.432994384580361, + "grad_norm": 0.9319629669189453, + "learning_rate": 4.2796395666700414e-05, + "loss": 1.0439, + "step": 11310 + }, + { + "epoch": 3.4332979207770524, + "grad_norm": 1.1577261686325073, + "learning_rate": 4.2791333400830216e-05, + "loss": 1.0824, + "step": 11311 + }, + { + "epoch": 3.433601456973744, + "grad_norm": 1.0466327667236328, + "learning_rate": 4.278627113496001e-05, + "loss": 0.7807, + "step": 11312 + }, + { + "epoch": 3.4339049931704357, + "grad_norm": 1.1635687351226807, + "learning_rate": 4.2781208869089805e-05, + "loss": 0.968, + "step": 11313 + }, + { + "epoch": 3.434208529367127, + "grad_norm": 0.9586396217346191, + "learning_rate": 4.27761466032196e-05, + "loss": 1.0939, + "step": 11314 + }, + { + "epoch": 3.4345120655638186, + "grad_norm": 0.7606629133224487, + "learning_rate": 4.27710843373494e-05, + "loss": 0.6852, + "step": 11315 + }, + { + "epoch": 3.43481560176051, + "grad_norm": 0.8569469451904297, + "learning_rate": 4.2766022071479196e-05, + "loss": 0.8764, + "step": 11316 + }, + { + "epoch": 3.4351191379572015, + "grad_norm": 1.0692583322525024, + "learning_rate": 4.276095980560899e-05, + "loss": 1.0139, + "step": 11317 + }, + { + "epoch": 3.4354226741538927, + "grad_norm": 1.1606030464172363, + "learning_rate": 4.2755897539738786e-05, + "loss": 0.9048, + "step": 11318 + }, + { + "epoch": 3.4357262103505843, + "grad_norm": 1.0916180610656738, + "learning_rate": 4.275083527386858e-05, + "loss": 0.7068, + "step": 11319 + }, + { + "epoch": 3.436029746547276, + "grad_norm": 1.0152596235275269, + "learning_rate": 4.274577300799838e-05, + "loss": 0.3156, + "step": 11320 + }, + { + "epoch": 3.436333282743967, + "grad_norm": 0.8738464117050171, + "learning_rate": 4.274071074212818e-05, + "loss": 0.5474, + "step": 11321 + }, + { + "epoch": 3.436636818940659, + "grad_norm": 1.0640361309051514, + "learning_rate": 4.273564847625798e-05, + "loss": 1.1321, + "step": 11322 + }, + { + "epoch": 3.43694035513735, + "grad_norm": 1.0717283487319946, + "learning_rate": 4.2730586210387774e-05, + "loss": 0.7346, + "step": 11323 + }, + { + "epoch": 3.4372438913340417, + "grad_norm": 1.1339484453201294, + "learning_rate": 4.272552394451757e-05, + "loss": 0.8186, + "step": 11324 + }, + { + "epoch": 3.437547427530733, + "grad_norm": 0.8488152027130127, + "learning_rate": 4.272046167864737e-05, + "loss": 0.7444, + "step": 11325 + }, + { + "epoch": 3.4378509637274246, + "grad_norm": 1.0202581882476807, + "learning_rate": 4.2715399412777165e-05, + "loss": 0.8271, + "step": 11326 + }, + { + "epoch": 3.438154499924116, + "grad_norm": 0.9720146656036377, + "learning_rate": 4.271033714690696e-05, + "loss": 0.9757, + "step": 11327 + }, + { + "epoch": 3.4384580361208075, + "grad_norm": 1.1949342489242554, + "learning_rate": 4.2705274881036754e-05, + "loss": 0.8732, + "step": 11328 + }, + { + "epoch": 3.4387615723174987, + "grad_norm": 0.870389997959137, + "learning_rate": 4.270021261516655e-05, + "loss": 0.9737, + "step": 11329 + }, + { + "epoch": 3.4390651085141903, + "grad_norm": 1.0162367820739746, + "learning_rate": 4.269515034929635e-05, + "loss": 1.5704, + "step": 11330 + }, + { + "epoch": 3.439368644710882, + "grad_norm": 0.9661878943443298, + "learning_rate": 4.2690088083426145e-05, + "loss": 1.1433, + "step": 11331 + }, + { + "epoch": 3.439672180907573, + "grad_norm": 1.0458884239196777, + "learning_rate": 4.268502581755594e-05, + "loss": 0.9945, + "step": 11332 + }, + { + "epoch": 3.439975717104265, + "grad_norm": 1.3290009498596191, + "learning_rate": 4.2679963551685735e-05, + "loss": 1.1717, + "step": 11333 + }, + { + "epoch": 3.440279253300956, + "grad_norm": 1.1976438760757446, + "learning_rate": 4.267490128581553e-05, + "loss": 0.8268, + "step": 11334 + }, + { + "epoch": 3.4405827894976477, + "grad_norm": 1.0173789262771606, + "learning_rate": 4.266983901994533e-05, + "loss": 1.0672, + "step": 11335 + }, + { + "epoch": 3.440886325694339, + "grad_norm": 1.0618650913238525, + "learning_rate": 4.2664776754075126e-05, + "loss": 0.8135, + "step": 11336 + }, + { + "epoch": 3.4411898618910306, + "grad_norm": 1.1215393543243408, + "learning_rate": 4.265971448820492e-05, + "loss": 0.492, + "step": 11337 + }, + { + "epoch": 3.441493398087722, + "grad_norm": 0.922492265701294, + "learning_rate": 4.2654652222334716e-05, + "loss": 1.0621, + "step": 11338 + }, + { + "epoch": 3.4417969342844135, + "grad_norm": 0.8850914835929871, + "learning_rate": 4.264958995646452e-05, + "loss": 1.1909, + "step": 11339 + }, + { + "epoch": 3.4421004704811047, + "grad_norm": 0.6576152443885803, + "learning_rate": 4.264452769059431e-05, + "loss": 1.0862, + "step": 11340 + }, + { + "epoch": 3.4424040066777963, + "grad_norm": 1.0998018980026245, + "learning_rate": 4.263946542472411e-05, + "loss": 0.7974, + "step": 11341 + }, + { + "epoch": 3.442707542874488, + "grad_norm": 0.8351936340332031, + "learning_rate": 4.26344031588539e-05, + "loss": 1.2674, + "step": 11342 + }, + { + "epoch": 3.443011079071179, + "grad_norm": 0.9735302329063416, + "learning_rate": 4.2629340892983697e-05, + "loss": 1.0371, + "step": 11343 + }, + { + "epoch": 3.443314615267871, + "grad_norm": 1.0718032121658325, + "learning_rate": 4.26242786271135e-05, + "loss": 0.5843, + "step": 11344 + }, + { + "epoch": 3.443618151464562, + "grad_norm": 1.026712417602539, + "learning_rate": 4.26192163612433e-05, + "loss": 0.5302, + "step": 11345 + }, + { + "epoch": 3.4439216876612537, + "grad_norm": 1.022610068321228, + "learning_rate": 4.2614154095373095e-05, + "loss": 0.72, + "step": 11346 + }, + { + "epoch": 3.444225223857945, + "grad_norm": 1.1872667074203491, + "learning_rate": 4.260909182950289e-05, + "loss": 1.0077, + "step": 11347 + }, + { + "epoch": 3.4445287600546366, + "grad_norm": 1.053798794746399, + "learning_rate": 4.2604029563632684e-05, + "loss": 0.9641, + "step": 11348 + }, + { + "epoch": 3.444832296251328, + "grad_norm": 0.8300012946128845, + "learning_rate": 4.259896729776248e-05, + "loss": 0.8664, + "step": 11349 + }, + { + "epoch": 3.4451358324480195, + "grad_norm": 1.3486764430999756, + "learning_rate": 4.259390503189228e-05, + "loss": 0.8037, + "step": 11350 + }, + { + "epoch": 3.4454393686447107, + "grad_norm": 1.0833659172058105, + "learning_rate": 4.2588842766022075e-05, + "loss": 1.0251, + "step": 11351 + }, + { + "epoch": 3.4457429048414023, + "grad_norm": 0.9971094727516174, + "learning_rate": 4.258378050015187e-05, + "loss": 1.1336, + "step": 11352 + }, + { + "epoch": 3.446046441038094, + "grad_norm": 1.090692400932312, + "learning_rate": 4.2578718234281665e-05, + "loss": 1.0548, + "step": 11353 + }, + { + "epoch": 3.446349977234785, + "grad_norm": 1.1356785297393799, + "learning_rate": 4.2573655968411466e-05, + "loss": 0.9418, + "step": 11354 + }, + { + "epoch": 3.446653513431477, + "grad_norm": 1.0246812105178833, + "learning_rate": 4.256859370254126e-05, + "loss": 0.7766, + "step": 11355 + }, + { + "epoch": 3.446957049628168, + "grad_norm": 1.0563921928405762, + "learning_rate": 4.2563531436671056e-05, + "loss": 0.6897, + "step": 11356 + }, + { + "epoch": 3.4472605858248597, + "grad_norm": 0.6654698252677917, + "learning_rate": 4.255846917080085e-05, + "loss": 1.1148, + "step": 11357 + }, + { + "epoch": 3.447564122021551, + "grad_norm": 0.9102757573127747, + "learning_rate": 4.2553406904930646e-05, + "loss": 0.9296, + "step": 11358 + }, + { + "epoch": 3.4478676582182426, + "grad_norm": 0.9190248250961304, + "learning_rate": 4.254834463906045e-05, + "loss": 0.7979, + "step": 11359 + }, + { + "epoch": 3.448171194414934, + "grad_norm": 0.8101600408554077, + "learning_rate": 4.254328237319024e-05, + "loss": 1.2089, + "step": 11360 + }, + { + "epoch": 3.4484747306116255, + "grad_norm": 0.8447693586349487, + "learning_rate": 4.253822010732004e-05, + "loss": 0.6658, + "step": 11361 + }, + { + "epoch": 3.4487782668083167, + "grad_norm": 1.1456685066223145, + "learning_rate": 4.253315784144983e-05, + "loss": 1.0936, + "step": 11362 + }, + { + "epoch": 3.4490818030050083, + "grad_norm": 0.7697335481643677, + "learning_rate": 4.2528095575579626e-05, + "loss": 1.1915, + "step": 11363 + }, + { + "epoch": 3.4493853392017, + "grad_norm": 1.0871517658233643, + "learning_rate": 4.252303330970943e-05, + "loss": 0.7598, + "step": 11364 + }, + { + "epoch": 3.449688875398391, + "grad_norm": 0.9080920815467834, + "learning_rate": 4.251797104383922e-05, + "loss": 1.1593, + "step": 11365 + }, + { + "epoch": 3.449992411595083, + "grad_norm": 1.0452148914337158, + "learning_rate": 4.251290877796902e-05, + "loss": 0.8694, + "step": 11366 + }, + { + "epoch": 3.450295947791774, + "grad_norm": 0.8897637724876404, + "learning_rate": 4.250784651209882e-05, + "loss": 0.59, + "step": 11367 + }, + { + "epoch": 3.4505994839884657, + "grad_norm": 0.8299393057823181, + "learning_rate": 4.2502784246228614e-05, + "loss": 0.976, + "step": 11368 + }, + { + "epoch": 3.450903020185157, + "grad_norm": 0.818630576133728, + "learning_rate": 4.2497721980358415e-05, + "loss": 1.3136, + "step": 11369 + }, + { + "epoch": 3.4512065563818486, + "grad_norm": 0.8682777285575867, + "learning_rate": 4.249265971448821e-05, + "loss": 0.4005, + "step": 11370 + }, + { + "epoch": 3.45151009257854, + "grad_norm": 1.15293550491333, + "learning_rate": 4.2487597448618005e-05, + "loss": 0.8882, + "step": 11371 + }, + { + "epoch": 3.4518136287752315, + "grad_norm": 0.8427293300628662, + "learning_rate": 4.24825351827478e-05, + "loss": 1.3444, + "step": 11372 + }, + { + "epoch": 3.4521171649719227, + "grad_norm": 1.0733944177627563, + "learning_rate": 4.2477472916877595e-05, + "loss": 1.0548, + "step": 11373 + }, + { + "epoch": 3.4524207011686143, + "grad_norm": 1.0374140739440918, + "learning_rate": 4.2472410651007396e-05, + "loss": 1.309, + "step": 11374 + }, + { + "epoch": 3.452724237365306, + "grad_norm": 1.1677690744400024, + "learning_rate": 4.246734838513719e-05, + "loss": 0.9801, + "step": 11375 + }, + { + "epoch": 3.453027773561997, + "grad_norm": 0.8594178557395935, + "learning_rate": 4.2462286119266986e-05, + "loss": 0.8199, + "step": 11376 + }, + { + "epoch": 3.453331309758689, + "grad_norm": 1.0336246490478516, + "learning_rate": 4.245722385339678e-05, + "loss": 0.7648, + "step": 11377 + }, + { + "epoch": 3.45363484595538, + "grad_norm": 0.9998457431793213, + "learning_rate": 4.245216158752658e-05, + "loss": 0.9647, + "step": 11378 + }, + { + "epoch": 3.4539383821520717, + "grad_norm": 0.8997493386268616, + "learning_rate": 4.244709932165638e-05, + "loss": 0.9831, + "step": 11379 + }, + { + "epoch": 3.454241918348763, + "grad_norm": 0.9620414972305298, + "learning_rate": 4.244203705578617e-05, + "loss": 0.7376, + "step": 11380 + }, + { + "epoch": 3.4545454545454546, + "grad_norm": 0.9143856167793274, + "learning_rate": 4.2436974789915967e-05, + "loss": 1.1844, + "step": 11381 + }, + { + "epoch": 3.4548489907421462, + "grad_norm": 1.0713844299316406, + "learning_rate": 4.243191252404576e-05, + "loss": 0.7831, + "step": 11382 + }, + { + "epoch": 3.4551525269388375, + "grad_norm": 1.0755388736724854, + "learning_rate": 4.242685025817556e-05, + "loss": 0.7423, + "step": 11383 + }, + { + "epoch": 3.4554560631355287, + "grad_norm": 1.0151540040969849, + "learning_rate": 4.242178799230536e-05, + "loss": 1.0222, + "step": 11384 + }, + { + "epoch": 3.4557595993322203, + "grad_norm": 0.9341586232185364, + "learning_rate": 4.241672572643515e-05, + "loss": 1.0342, + "step": 11385 + }, + { + "epoch": 3.456063135528912, + "grad_norm": 1.199669599533081, + "learning_rate": 4.241166346056495e-05, + "loss": 0.7258, + "step": 11386 + }, + { + "epoch": 3.456366671725603, + "grad_norm": 0.8179941773414612, + "learning_rate": 4.240660119469474e-05, + "loss": 1.4185, + "step": 11387 + }, + { + "epoch": 3.456670207922295, + "grad_norm": 0.8720028400421143, + "learning_rate": 4.2401538928824544e-05, + "loss": 0.7346, + "step": 11388 + }, + { + "epoch": 3.456973744118986, + "grad_norm": 0.8772128820419312, + "learning_rate": 4.2396476662954345e-05, + "loss": 1.1054, + "step": 11389 + }, + { + "epoch": 3.4572772803156777, + "grad_norm": 1.2960760593414307, + "learning_rate": 4.239141439708414e-05, + "loss": 0.9452, + "step": 11390 + }, + { + "epoch": 3.457580816512369, + "grad_norm": 1.3870878219604492, + "learning_rate": 4.2386352131213935e-05, + "loss": 1.1864, + "step": 11391 + }, + { + "epoch": 3.4578843527090606, + "grad_norm": 1.0856776237487793, + "learning_rate": 4.238128986534373e-05, + "loss": 0.5225, + "step": 11392 + }, + { + "epoch": 3.4581878889057522, + "grad_norm": 0.8393153548240662, + "learning_rate": 4.237622759947353e-05, + "loss": 0.7185, + "step": 11393 + }, + { + "epoch": 3.4584914251024435, + "grad_norm": 1.0542411804199219, + "learning_rate": 4.2371165333603326e-05, + "loss": 1.0479, + "step": 11394 + }, + { + "epoch": 3.458794961299135, + "grad_norm": 1.0236605405807495, + "learning_rate": 4.236610306773312e-05, + "loss": 0.8855, + "step": 11395 + }, + { + "epoch": 3.4590984974958263, + "grad_norm": 0.9843754172325134, + "learning_rate": 4.2361040801862916e-05, + "loss": 0.569, + "step": 11396 + }, + { + "epoch": 3.459402033692518, + "grad_norm": 1.1107150316238403, + "learning_rate": 4.235597853599271e-05, + "loss": 0.4897, + "step": 11397 + }, + { + "epoch": 3.459705569889209, + "grad_norm": 1.1543493270874023, + "learning_rate": 4.235091627012251e-05, + "loss": 1.0492, + "step": 11398 + }, + { + "epoch": 3.460009106085901, + "grad_norm": 1.1336286067962646, + "learning_rate": 4.234585400425231e-05, + "loss": 0.8287, + "step": 11399 + }, + { + "epoch": 3.460312642282592, + "grad_norm": 1.0350695848464966, + "learning_rate": 4.23407917383821e-05, + "loss": 1.1832, + "step": 11400 + }, + { + "epoch": 3.4606161784792837, + "grad_norm": 1.234724521636963, + "learning_rate": 4.2335729472511896e-05, + "loss": 0.8863, + "step": 11401 + }, + { + "epoch": 3.460919714675975, + "grad_norm": 1.0122151374816895, + "learning_rate": 4.233066720664169e-05, + "loss": 1.206, + "step": 11402 + }, + { + "epoch": 3.4612232508726666, + "grad_norm": 1.1283066272735596, + "learning_rate": 4.232560494077149e-05, + "loss": 0.7401, + "step": 11403 + }, + { + "epoch": 3.4615267870693582, + "grad_norm": 0.869615912437439, + "learning_rate": 4.232054267490129e-05, + "loss": 0.7858, + "step": 11404 + }, + { + "epoch": 3.4618303232660494, + "grad_norm": 1.0001555681228638, + "learning_rate": 4.231548040903108e-05, + "loss": 0.9936, + "step": 11405 + }, + { + "epoch": 3.462133859462741, + "grad_norm": 1.1988389492034912, + "learning_rate": 4.231041814316088e-05, + "loss": 0.5019, + "step": 11406 + }, + { + "epoch": 3.4624373956594323, + "grad_norm": 0.9713451266288757, + "learning_rate": 4.230535587729068e-05, + "loss": 0.6751, + "step": 11407 + }, + { + "epoch": 3.462740931856124, + "grad_norm": 0.9207466840744019, + "learning_rate": 4.2300293611420473e-05, + "loss": 1.0762, + "step": 11408 + }, + { + "epoch": 3.463044468052815, + "grad_norm": 1.0005989074707031, + "learning_rate": 4.229523134555027e-05, + "loss": 0.5641, + "step": 11409 + }, + { + "epoch": 3.463348004249507, + "grad_norm": 1.1158218383789062, + "learning_rate": 4.229016907968006e-05, + "loss": 0.7414, + "step": 11410 + }, + { + "epoch": 3.463651540446198, + "grad_norm": 1.0111825466156006, + "learning_rate": 4.2285106813809865e-05, + "loss": 0.5438, + "step": 11411 + }, + { + "epoch": 3.4639550766428897, + "grad_norm": 1.0516672134399414, + "learning_rate": 4.228004454793966e-05, + "loss": 1.1537, + "step": 11412 + }, + { + "epoch": 3.464258612839581, + "grad_norm": 1.1548079252243042, + "learning_rate": 4.227498228206946e-05, + "loss": 0.9079, + "step": 11413 + }, + { + "epoch": 3.4645621490362726, + "grad_norm": 0.8801454901695251, + "learning_rate": 4.2269920016199256e-05, + "loss": 1.1744, + "step": 11414 + }, + { + "epoch": 3.4648656852329642, + "grad_norm": 1.1711843013763428, + "learning_rate": 4.226485775032905e-05, + "loss": 0.5712, + "step": 11415 + }, + { + "epoch": 3.4651692214296554, + "grad_norm": 0.9582653641700745, + "learning_rate": 4.2259795484458845e-05, + "loss": 1.0291, + "step": 11416 + }, + { + "epoch": 3.465472757626347, + "grad_norm": 1.083159327507019, + "learning_rate": 4.225473321858865e-05, + "loss": 0.7452, + "step": 11417 + }, + { + "epoch": 3.4657762938230383, + "grad_norm": 0.7866515517234802, + "learning_rate": 4.224967095271844e-05, + "loss": 0.7906, + "step": 11418 + }, + { + "epoch": 3.46607983001973, + "grad_norm": 0.8159627318382263, + "learning_rate": 4.2244608686848236e-05, + "loss": 1.0576, + "step": 11419 + }, + { + "epoch": 3.466383366216421, + "grad_norm": 0.8906406760215759, + "learning_rate": 4.223954642097803e-05, + "loss": 1.3358, + "step": 11420 + }, + { + "epoch": 3.466686902413113, + "grad_norm": 0.8896162509918213, + "learning_rate": 4.2234484155107826e-05, + "loss": 1.177, + "step": 11421 + }, + { + "epoch": 3.466990438609804, + "grad_norm": 0.947354793548584, + "learning_rate": 4.222942188923763e-05, + "loss": 0.5972, + "step": 11422 + }, + { + "epoch": 3.4672939748064957, + "grad_norm": 1.0936514139175415, + "learning_rate": 4.222435962336742e-05, + "loss": 0.5815, + "step": 11423 + }, + { + "epoch": 3.467597511003187, + "grad_norm": 1.2376407384872437, + "learning_rate": 4.221929735749722e-05, + "loss": 0.7873, + "step": 11424 + }, + { + "epoch": 3.4679010471998786, + "grad_norm": 1.1659278869628906, + "learning_rate": 4.221423509162701e-05, + "loss": 0.8379, + "step": 11425 + }, + { + "epoch": 3.4682045833965702, + "grad_norm": 0.6670882701873779, + "learning_rate": 4.220917282575681e-05, + "loss": 1.3353, + "step": 11426 + }, + { + "epoch": 3.4685081195932614, + "grad_norm": 1.0464056730270386, + "learning_rate": 4.220411055988661e-05, + "loss": 0.6465, + "step": 11427 + }, + { + "epoch": 3.468811655789953, + "grad_norm": 0.8475186824798584, + "learning_rate": 4.21990482940164e-05, + "loss": 0.4934, + "step": 11428 + }, + { + "epoch": 3.4691151919866443, + "grad_norm": 1.0401862859725952, + "learning_rate": 4.21939860281462e-05, + "loss": 0.9686, + "step": 11429 + }, + { + "epoch": 3.469418728183336, + "grad_norm": 1.041549801826477, + "learning_rate": 4.218892376227599e-05, + "loss": 0.9022, + "step": 11430 + }, + { + "epoch": 3.469722264380027, + "grad_norm": 0.9369441270828247, + "learning_rate": 4.2183861496405794e-05, + "loss": 0.8779, + "step": 11431 + }, + { + "epoch": 3.470025800576719, + "grad_norm": 1.0543452501296997, + "learning_rate": 4.217879923053559e-05, + "loss": 0.8848, + "step": 11432 + }, + { + "epoch": 3.47032933677341, + "grad_norm": 1.1141040325164795, + "learning_rate": 4.2173736964665384e-05, + "loss": 1.0235, + "step": 11433 + }, + { + "epoch": 3.4706328729701017, + "grad_norm": 1.0424753427505493, + "learning_rate": 4.2168674698795186e-05, + "loss": 0.8997, + "step": 11434 + }, + { + "epoch": 3.470936409166793, + "grad_norm": 0.9775320887565613, + "learning_rate": 4.216361243292498e-05, + "loss": 0.9789, + "step": 11435 + }, + { + "epoch": 3.4712399453634846, + "grad_norm": 1.34420907497406, + "learning_rate": 4.2158550167054775e-05, + "loss": 0.7055, + "step": 11436 + }, + { + "epoch": 3.4715434815601762, + "grad_norm": 0.8173890709877014, + "learning_rate": 4.215348790118458e-05, + "loss": 0.7523, + "step": 11437 + }, + { + "epoch": 3.4718470177568674, + "grad_norm": 1.0417473316192627, + "learning_rate": 4.214842563531437e-05, + "loss": 0.8027, + "step": 11438 + }, + { + "epoch": 3.472150553953559, + "grad_norm": 1.0889456272125244, + "learning_rate": 4.2143363369444166e-05, + "loss": 1.2364, + "step": 11439 + }, + { + "epoch": 3.4724540901502503, + "grad_norm": 1.1871832609176636, + "learning_rate": 4.213830110357396e-05, + "loss": 0.9615, + "step": 11440 + }, + { + "epoch": 3.472757626346942, + "grad_norm": 0.8793348670005798, + "learning_rate": 4.2133238837703756e-05, + "loss": 0.9709, + "step": 11441 + }, + { + "epoch": 3.473061162543633, + "grad_norm": 1.0387309789657593, + "learning_rate": 4.212817657183356e-05, + "loss": 1.3455, + "step": 11442 + }, + { + "epoch": 3.473364698740325, + "grad_norm": 0.9309706687927246, + "learning_rate": 4.212311430596335e-05, + "loss": 1.2227, + "step": 11443 + }, + { + "epoch": 3.4736682349370165, + "grad_norm": 0.9305075407028198, + "learning_rate": 4.211805204009315e-05, + "loss": 1.1316, + "step": 11444 + }, + { + "epoch": 3.4739717711337077, + "grad_norm": 1.0947800874710083, + "learning_rate": 4.211298977422294e-05, + "loss": 0.6821, + "step": 11445 + }, + { + "epoch": 3.474275307330399, + "grad_norm": 1.0373951196670532, + "learning_rate": 4.210792750835274e-05, + "loss": 0.4706, + "step": 11446 + }, + { + "epoch": 3.4745788435270906, + "grad_norm": 0.9517707824707031, + "learning_rate": 4.210286524248254e-05, + "loss": 1.1271, + "step": 11447 + }, + { + "epoch": 3.474882379723782, + "grad_norm": 0.9940754175186157, + "learning_rate": 4.209780297661233e-05, + "loss": 0.9014, + "step": 11448 + }, + { + "epoch": 3.4751859159204734, + "grad_norm": 0.9532588720321655, + "learning_rate": 4.209274071074213e-05, + "loss": 1.325, + "step": 11449 + }, + { + "epoch": 3.475489452117165, + "grad_norm": 1.0697945356369019, + "learning_rate": 4.208767844487192e-05, + "loss": 0.8582, + "step": 11450 + }, + { + "epoch": 3.4757929883138563, + "grad_norm": 0.8923087120056152, + "learning_rate": 4.2082616179001724e-05, + "loss": 1.2748, + "step": 11451 + }, + { + "epoch": 3.476096524510548, + "grad_norm": 0.8714694380760193, + "learning_rate": 4.207755391313152e-05, + "loss": 1.1616, + "step": 11452 + }, + { + "epoch": 3.476400060707239, + "grad_norm": 1.2103550434112549, + "learning_rate": 4.2072491647261314e-05, + "loss": 1.0126, + "step": 11453 + }, + { + "epoch": 3.476703596903931, + "grad_norm": 1.1456637382507324, + "learning_rate": 4.206742938139111e-05, + "loss": 0.8538, + "step": 11454 + }, + { + "epoch": 3.4770071331006225, + "grad_norm": 1.0365684032440186, + "learning_rate": 4.20623671155209e-05, + "loss": 0.7722, + "step": 11455 + }, + { + "epoch": 3.4773106692973137, + "grad_norm": 1.0593724250793457, + "learning_rate": 4.205730484965071e-05, + "loss": 1.1097, + "step": 11456 + }, + { + "epoch": 3.4776142054940054, + "grad_norm": 0.9655763506889343, + "learning_rate": 4.2052242583780506e-05, + "loss": 1.4288, + "step": 11457 + }, + { + "epoch": 3.4779177416906966, + "grad_norm": 0.9874686598777771, + "learning_rate": 4.20471803179103e-05, + "loss": 1.1566, + "step": 11458 + }, + { + "epoch": 3.478221277887388, + "grad_norm": 0.9987507462501526, + "learning_rate": 4.2042118052040096e-05, + "loss": 0.6967, + "step": 11459 + }, + { + "epoch": 3.4785248140840794, + "grad_norm": 0.8439647555351257, + "learning_rate": 4.203705578616989e-05, + "loss": 1.1145, + "step": 11460 + }, + { + "epoch": 3.478828350280771, + "grad_norm": 1.3992457389831543, + "learning_rate": 4.203199352029969e-05, + "loss": 0.8811, + "step": 11461 + }, + { + "epoch": 3.4791318864774623, + "grad_norm": 0.9427589774131775, + "learning_rate": 4.202693125442949e-05, + "loss": 1.0524, + "step": 11462 + }, + { + "epoch": 3.479435422674154, + "grad_norm": 1.0369971990585327, + "learning_rate": 4.202186898855928e-05, + "loss": 0.8197, + "step": 11463 + }, + { + "epoch": 3.479738958870845, + "grad_norm": 0.8164836764335632, + "learning_rate": 4.201680672268908e-05, + "loss": 1.4574, + "step": 11464 + }, + { + "epoch": 3.480042495067537, + "grad_norm": 0.9262059926986694, + "learning_rate": 4.201174445681887e-05, + "loss": 0.7735, + "step": 11465 + }, + { + "epoch": 3.4803460312642285, + "grad_norm": 0.9015330076217651, + "learning_rate": 4.200668219094867e-05, + "loss": 1.0098, + "step": 11466 + }, + { + "epoch": 3.4806495674609197, + "grad_norm": 1.1501796245574951, + "learning_rate": 4.200161992507847e-05, + "loss": 1.1718, + "step": 11467 + }, + { + "epoch": 3.4809531036576113, + "grad_norm": 1.141869068145752, + "learning_rate": 4.199655765920826e-05, + "loss": 1.116, + "step": 11468 + }, + { + "epoch": 3.4812566398543026, + "grad_norm": 1.0674519538879395, + "learning_rate": 4.199149539333806e-05, + "loss": 1.5515, + "step": 11469 + }, + { + "epoch": 3.481560176050994, + "grad_norm": 0.83263099193573, + "learning_rate": 4.198643312746786e-05, + "loss": 1.3109, + "step": 11470 + }, + { + "epoch": 3.4818637122476854, + "grad_norm": 1.1275105476379395, + "learning_rate": 4.1981370861597654e-05, + "loss": 1.1622, + "step": 11471 + }, + { + "epoch": 3.482167248444377, + "grad_norm": 0.8923318982124329, + "learning_rate": 4.197630859572745e-05, + "loss": 0.5513, + "step": 11472 + }, + { + "epoch": 3.4824707846410683, + "grad_norm": 1.037046194076538, + "learning_rate": 4.1971246329857243e-05, + "loss": 1.114, + "step": 11473 + }, + { + "epoch": 3.48277432083776, + "grad_norm": 0.6146632432937622, + "learning_rate": 4.196618406398704e-05, + "loss": 0.6133, + "step": 11474 + }, + { + "epoch": 3.483077857034451, + "grad_norm": 0.9399073123931885, + "learning_rate": 4.196112179811684e-05, + "loss": 1.1753, + "step": 11475 + }, + { + "epoch": 3.483381393231143, + "grad_norm": 0.8917518854141235, + "learning_rate": 4.1956059532246635e-05, + "loss": 0.7032, + "step": 11476 + }, + { + "epoch": 3.4836849294278345, + "grad_norm": 0.8698925971984863, + "learning_rate": 4.195099726637643e-05, + "loss": 1.3927, + "step": 11477 + }, + { + "epoch": 3.4839884656245257, + "grad_norm": 0.8484128713607788, + "learning_rate": 4.194593500050623e-05, + "loss": 1.0103, + "step": 11478 + }, + { + "epoch": 3.4842920018212173, + "grad_norm": 0.7555760145187378, + "learning_rate": 4.1940872734636026e-05, + "loss": 1.2282, + "step": 11479 + }, + { + "epoch": 3.4845955380179086, + "grad_norm": 1.209089756011963, + "learning_rate": 4.193581046876582e-05, + "loss": 0.8643, + "step": 11480 + }, + { + "epoch": 3.4848990742146, + "grad_norm": 1.0552500486373901, + "learning_rate": 4.193074820289562e-05, + "loss": 1.1186, + "step": 11481 + }, + { + "epoch": 3.4852026104112914, + "grad_norm": 0.7340880036354065, + "learning_rate": 4.192568593702542e-05, + "loss": 0.5461, + "step": 11482 + }, + { + "epoch": 3.485506146607983, + "grad_norm": 0.8937561511993408, + "learning_rate": 4.192062367115521e-05, + "loss": 1.2119, + "step": 11483 + }, + { + "epoch": 3.4858096828046743, + "grad_norm": 0.98643559217453, + "learning_rate": 4.1915561405285007e-05, + "loss": 0.985, + "step": 11484 + }, + { + "epoch": 3.486113219001366, + "grad_norm": 1.1967453956604004, + "learning_rate": 4.191049913941481e-05, + "loss": 1.189, + "step": 11485 + }, + { + "epoch": 3.486416755198057, + "grad_norm": 0.7951077222824097, + "learning_rate": 4.19054368735446e-05, + "loss": 0.7574, + "step": 11486 + }, + { + "epoch": 3.486720291394749, + "grad_norm": 1.1270830631256104, + "learning_rate": 4.19003746076744e-05, + "loss": 0.9813, + "step": 11487 + }, + { + "epoch": 3.4870238275914405, + "grad_norm": 1.181876540184021, + "learning_rate": 4.189531234180419e-05, + "loss": 0.8237, + "step": 11488 + }, + { + "epoch": 3.4873273637881317, + "grad_norm": 1.2361828088760376, + "learning_rate": 4.189025007593399e-05, + "loss": 0.9088, + "step": 11489 + }, + { + "epoch": 3.4876308999848233, + "grad_norm": 1.094062328338623, + "learning_rate": 4.188518781006379e-05, + "loss": 1.1796, + "step": 11490 + }, + { + "epoch": 3.4879344361815146, + "grad_norm": 0.9124131202697754, + "learning_rate": 4.1880125544193584e-05, + "loss": 0.9033, + "step": 11491 + }, + { + "epoch": 3.488237972378206, + "grad_norm": 1.0972652435302734, + "learning_rate": 4.187506327832338e-05, + "loss": 0.9968, + "step": 11492 + }, + { + "epoch": 3.4885415085748974, + "grad_norm": 1.0009212493896484, + "learning_rate": 4.187000101245317e-05, + "loss": 0.619, + "step": 11493 + }, + { + "epoch": 3.488845044771589, + "grad_norm": 1.04353928565979, + "learning_rate": 4.186493874658297e-05, + "loss": 0.3262, + "step": 11494 + }, + { + "epoch": 3.4891485809682803, + "grad_norm": 1.2046505212783813, + "learning_rate": 4.185987648071277e-05, + "loss": 0.9637, + "step": 11495 + }, + { + "epoch": 3.489452117164972, + "grad_norm": 0.9516655802726746, + "learning_rate": 4.1854814214842564e-05, + "loss": 1.3008, + "step": 11496 + }, + { + "epoch": 3.489755653361663, + "grad_norm": 0.7282963395118713, + "learning_rate": 4.184975194897236e-05, + "loss": 0.8494, + "step": 11497 + }, + { + "epoch": 3.490059189558355, + "grad_norm": 1.0606610774993896, + "learning_rate": 4.1844689683102154e-05, + "loss": 0.9991, + "step": 11498 + }, + { + "epoch": 3.4903627257550465, + "grad_norm": 1.0610661506652832, + "learning_rate": 4.1839627417231956e-05, + "loss": 0.4275, + "step": 11499 + }, + { + "epoch": 3.4906662619517377, + "grad_norm": 1.0951975584030151, + "learning_rate": 4.183456515136176e-05, + "loss": 0.937, + "step": 11500 + }, + { + "epoch": 3.4909697981484293, + "grad_norm": 1.05243718624115, + "learning_rate": 4.182950288549155e-05, + "loss": 0.8367, + "step": 11501 + }, + { + "epoch": 3.4912733343451205, + "grad_norm": 1.0030443668365479, + "learning_rate": 4.182444061962135e-05, + "loss": 0.6246, + "step": 11502 + }, + { + "epoch": 3.491576870541812, + "grad_norm": 0.9411659240722656, + "learning_rate": 4.181937835375114e-05, + "loss": 1.301, + "step": 11503 + }, + { + "epoch": 3.4918804067385034, + "grad_norm": 1.0777431726455688, + "learning_rate": 4.1814316087880936e-05, + "loss": 0.495, + "step": 11504 + }, + { + "epoch": 3.492183942935195, + "grad_norm": 0.8631514310836792, + "learning_rate": 4.180925382201074e-05, + "loss": 0.687, + "step": 11505 + }, + { + "epoch": 3.4924874791318863, + "grad_norm": 0.8915711045265198, + "learning_rate": 4.180419155614053e-05, + "loss": 1.2496, + "step": 11506 + }, + { + "epoch": 3.492791015328578, + "grad_norm": 0.9451521039009094, + "learning_rate": 4.179912929027033e-05, + "loss": 0.6648, + "step": 11507 + }, + { + "epoch": 3.493094551525269, + "grad_norm": 0.8736832141876221, + "learning_rate": 4.179406702440012e-05, + "loss": 1.24, + "step": 11508 + }, + { + "epoch": 3.493398087721961, + "grad_norm": 1.5248959064483643, + "learning_rate": 4.1789004758529924e-05, + "loss": 0.7368, + "step": 11509 + }, + { + "epoch": 3.4937016239186525, + "grad_norm": 0.8818194270133972, + "learning_rate": 4.178394249265972e-05, + "loss": 1.0387, + "step": 11510 + }, + { + "epoch": 3.4940051601153437, + "grad_norm": 1.0787771940231323, + "learning_rate": 4.1778880226789513e-05, + "loss": 0.8883, + "step": 11511 + }, + { + "epoch": 3.4943086963120353, + "grad_norm": 1.526505708694458, + "learning_rate": 4.177381796091931e-05, + "loss": 0.8415, + "step": 11512 + }, + { + "epoch": 3.4946122325087265, + "grad_norm": 1.2998838424682617, + "learning_rate": 4.17687556950491e-05, + "loss": 0.9793, + "step": 11513 + }, + { + "epoch": 3.494915768705418, + "grad_norm": 1.1652768850326538, + "learning_rate": 4.1763693429178905e-05, + "loss": 0.9198, + "step": 11514 + }, + { + "epoch": 3.4952193049021094, + "grad_norm": 1.1354137659072876, + "learning_rate": 4.17586311633087e-05, + "loss": 0.7469, + "step": 11515 + }, + { + "epoch": 3.495522841098801, + "grad_norm": 1.0571775436401367, + "learning_rate": 4.1753568897438494e-05, + "loss": 1.1789, + "step": 11516 + }, + { + "epoch": 3.4958263772954927, + "grad_norm": 1.1516374349594116, + "learning_rate": 4.174850663156829e-05, + "loss": 0.6836, + "step": 11517 + }, + { + "epoch": 3.496129913492184, + "grad_norm": 1.1770570278167725, + "learning_rate": 4.1743444365698084e-05, + "loss": 1.017, + "step": 11518 + }, + { + "epoch": 3.496433449688875, + "grad_norm": 0.9382774829864502, + "learning_rate": 4.1738382099827885e-05, + "loss": 1.0153, + "step": 11519 + }, + { + "epoch": 3.496736985885567, + "grad_norm": 0.8816243410110474, + "learning_rate": 4.173331983395768e-05, + "loss": 0.2258, + "step": 11520 + }, + { + "epoch": 3.4970405220822585, + "grad_norm": 1.095651388168335, + "learning_rate": 4.1728257568087475e-05, + "loss": 1.1353, + "step": 11521 + }, + { + "epoch": 3.4973440582789497, + "grad_norm": 1.0985361337661743, + "learning_rate": 4.172319530221727e-05, + "loss": 0.5652, + "step": 11522 + }, + { + "epoch": 3.4976475944756413, + "grad_norm": 0.8571131229400635, + "learning_rate": 4.171813303634707e-05, + "loss": 1.0663, + "step": 11523 + }, + { + "epoch": 3.4979511306723325, + "grad_norm": 1.074644923210144, + "learning_rate": 4.171307077047687e-05, + "loss": 1.0802, + "step": 11524 + }, + { + "epoch": 3.498254666869024, + "grad_norm": 1.118381142616272, + "learning_rate": 4.170800850460667e-05, + "loss": 0.8564, + "step": 11525 + }, + { + "epoch": 3.4985582030657154, + "grad_norm": 1.1617616415023804, + "learning_rate": 4.170294623873646e-05, + "loss": 0.9543, + "step": 11526 + }, + { + "epoch": 3.498861739262407, + "grad_norm": 1.0510815382003784, + "learning_rate": 4.169788397286626e-05, + "loss": 0.9253, + "step": 11527 + }, + { + "epoch": 3.4991652754590987, + "grad_norm": 1.2857627868652344, + "learning_rate": 4.169282170699605e-05, + "loss": 0.5656, + "step": 11528 + }, + { + "epoch": 3.49946881165579, + "grad_norm": 1.1572846174240112, + "learning_rate": 4.1687759441125854e-05, + "loss": 0.8958, + "step": 11529 + }, + { + "epoch": 3.4997723478524816, + "grad_norm": 1.0880467891693115, + "learning_rate": 4.168269717525565e-05, + "loss": 0.4713, + "step": 11530 + }, + { + "epoch": 3.500075884049173, + "grad_norm": 0.9316008687019348, + "learning_rate": 4.167763490938544e-05, + "loss": 0.8316, + "step": 11531 + }, + { + "epoch": 3.5003794202458645, + "grad_norm": 0.959507405757904, + "learning_rate": 4.167257264351524e-05, + "loss": 1.2981, + "step": 11532 + }, + { + "epoch": 3.5006829564425557, + "grad_norm": 0.7986817955970764, + "learning_rate": 4.166751037764503e-05, + "loss": 1.3842, + "step": 11533 + }, + { + "epoch": 3.5009864926392473, + "grad_norm": 1.037031650543213, + "learning_rate": 4.1662448111774834e-05, + "loss": 1.0915, + "step": 11534 + }, + { + "epoch": 3.5012900288359385, + "grad_norm": 0.9904575347900391, + "learning_rate": 4.165738584590463e-05, + "loss": 1.0866, + "step": 11535 + }, + { + "epoch": 3.50159356503263, + "grad_norm": 1.1318732500076294, + "learning_rate": 4.1652323580034424e-05, + "loss": 1.0147, + "step": 11536 + }, + { + "epoch": 3.5018971012293214, + "grad_norm": 0.9317408204078674, + "learning_rate": 4.164726131416422e-05, + "loss": 0.9158, + "step": 11537 + }, + { + "epoch": 3.502200637426013, + "grad_norm": 0.833384096622467, + "learning_rate": 4.164219904829402e-05, + "loss": 0.2761, + "step": 11538 + }, + { + "epoch": 3.5025041736227047, + "grad_norm": 0.8594617247581482, + "learning_rate": 4.1637136782423815e-05, + "loss": 0.8597, + "step": 11539 + }, + { + "epoch": 3.502807709819396, + "grad_norm": 0.622658371925354, + "learning_rate": 4.163207451655361e-05, + "loss": 1.3716, + "step": 11540 + }, + { + "epoch": 3.503111246016087, + "grad_norm": 1.2160855531692505, + "learning_rate": 4.1627012250683405e-05, + "loss": 0.8731, + "step": 11541 + }, + { + "epoch": 3.503414782212779, + "grad_norm": 0.874017596244812, + "learning_rate": 4.16219499848132e-05, + "loss": 0.993, + "step": 11542 + }, + { + "epoch": 3.5037183184094705, + "grad_norm": 1.1052695512771606, + "learning_rate": 4.1616887718943e-05, + "loss": 1.063, + "step": 11543 + }, + { + "epoch": 3.5040218546061617, + "grad_norm": 1.0710269212722778, + "learning_rate": 4.1611825453072796e-05, + "loss": 1.0071, + "step": 11544 + }, + { + "epoch": 3.5043253908028533, + "grad_norm": 1.1148152351379395, + "learning_rate": 4.16067631872026e-05, + "loss": 0.9155, + "step": 11545 + }, + { + "epoch": 3.5046289269995445, + "grad_norm": 1.045923113822937, + "learning_rate": 4.160170092133239e-05, + "loss": 0.9526, + "step": 11546 + }, + { + "epoch": 3.504932463196236, + "grad_norm": 1.1013811826705933, + "learning_rate": 4.159663865546219e-05, + "loss": 0.6117, + "step": 11547 + }, + { + "epoch": 3.5052359993929274, + "grad_norm": 1.039097547531128, + "learning_rate": 4.159157638959198e-05, + "loss": 1.2716, + "step": 11548 + }, + { + "epoch": 3.505539535589619, + "grad_norm": 0.9880130887031555, + "learning_rate": 4.1586514123721783e-05, + "loss": 0.7007, + "step": 11549 + }, + { + "epoch": 3.5058430717863107, + "grad_norm": 1.0184074640274048, + "learning_rate": 4.158145185785158e-05, + "loss": 0.801, + "step": 11550 + }, + { + "epoch": 3.506146607983002, + "grad_norm": 0.8864231109619141, + "learning_rate": 4.157638959198137e-05, + "loss": 1.2995, + "step": 11551 + }, + { + "epoch": 3.5064501441796936, + "grad_norm": 0.9638323783874512, + "learning_rate": 4.157132732611117e-05, + "loss": 0.68, + "step": 11552 + }, + { + "epoch": 3.506753680376385, + "grad_norm": 0.8574047088623047, + "learning_rate": 4.156626506024097e-05, + "loss": 1.2216, + "step": 11553 + }, + { + "epoch": 3.5070572165730765, + "grad_norm": 0.9776139259338379, + "learning_rate": 4.1561202794370764e-05, + "loss": 0.7914, + "step": 11554 + }, + { + "epoch": 3.5073607527697677, + "grad_norm": 1.1641311645507812, + "learning_rate": 4.155614052850056e-05, + "loss": 0.9242, + "step": 11555 + }, + { + "epoch": 3.5076642889664593, + "grad_norm": 1.1273466348648071, + "learning_rate": 4.1551078262630354e-05, + "loss": 0.9495, + "step": 11556 + }, + { + "epoch": 3.507967825163151, + "grad_norm": 0.9333158135414124, + "learning_rate": 4.154601599676015e-05, + "loss": 0.8506, + "step": 11557 + }, + { + "epoch": 3.508271361359842, + "grad_norm": 1.1422935724258423, + "learning_rate": 4.154095373088995e-05, + "loss": 0.7052, + "step": 11558 + }, + { + "epoch": 3.5085748975565334, + "grad_norm": 0.9645944833755493, + "learning_rate": 4.1535891465019745e-05, + "loss": 0.9235, + "step": 11559 + }, + { + "epoch": 3.508878433753225, + "grad_norm": 1.468064546585083, + "learning_rate": 4.153082919914954e-05, + "loss": 0.8042, + "step": 11560 + }, + { + "epoch": 3.5091819699499167, + "grad_norm": 1.0878000259399414, + "learning_rate": 4.1525766933279334e-05, + "loss": 0.7054, + "step": 11561 + }, + { + "epoch": 3.509485506146608, + "grad_norm": 0.9729339480400085, + "learning_rate": 4.1520704667409136e-05, + "loss": 1.2483, + "step": 11562 + }, + { + "epoch": 3.5097890423432996, + "grad_norm": 1.1070778369903564, + "learning_rate": 4.151564240153893e-05, + "loss": 1.1408, + "step": 11563 + }, + { + "epoch": 3.510092578539991, + "grad_norm": 1.3622183799743652, + "learning_rate": 4.1510580135668726e-05, + "loss": 0.688, + "step": 11564 + }, + { + "epoch": 3.5103961147366824, + "grad_norm": 1.1606196165084839, + "learning_rate": 4.150551786979852e-05, + "loss": 1.1113, + "step": 11565 + }, + { + "epoch": 3.5106996509333737, + "grad_norm": 0.9894608855247498, + "learning_rate": 4.1500455603928315e-05, + "loss": 0.6443, + "step": 11566 + }, + { + "epoch": 3.5110031871300653, + "grad_norm": 1.2046208381652832, + "learning_rate": 4.149539333805812e-05, + "loss": 0.9214, + "step": 11567 + }, + { + "epoch": 3.511306723326757, + "grad_norm": 1.0471440553665161, + "learning_rate": 4.149033107218792e-05, + "loss": 0.8968, + "step": 11568 + }, + { + "epoch": 3.511610259523448, + "grad_norm": 1.3555386066436768, + "learning_rate": 4.148526880631771e-05, + "loss": 1.0387, + "step": 11569 + }, + { + "epoch": 3.5119137957201394, + "grad_norm": 1.051173448562622, + "learning_rate": 4.148020654044751e-05, + "loss": 0.7926, + "step": 11570 + }, + { + "epoch": 3.512217331916831, + "grad_norm": 1.3176108598709106, + "learning_rate": 4.14751442745773e-05, + "loss": 0.7444, + "step": 11571 + }, + { + "epoch": 3.5125208681135227, + "grad_norm": 0.8911525011062622, + "learning_rate": 4.14700820087071e-05, + "loss": 1.279, + "step": 11572 + }, + { + "epoch": 3.512824404310214, + "grad_norm": 1.1382569074630737, + "learning_rate": 4.14650197428369e-05, + "loss": 0.618, + "step": 11573 + }, + { + "epoch": 3.5131279405069056, + "grad_norm": 1.1052045822143555, + "learning_rate": 4.1459957476966694e-05, + "loss": 0.7413, + "step": 11574 + }, + { + "epoch": 3.513431476703597, + "grad_norm": 0.9809271693229675, + "learning_rate": 4.145489521109649e-05, + "loss": 1.0505, + "step": 11575 + }, + { + "epoch": 3.5137350129002884, + "grad_norm": 0.9727836847305298, + "learning_rate": 4.1449832945226284e-05, + "loss": 0.7851, + "step": 11576 + }, + { + "epoch": 3.5140385490969797, + "grad_norm": 0.7727161645889282, + "learning_rate": 4.1444770679356085e-05, + "loss": 0.4431, + "step": 11577 + }, + { + "epoch": 3.5143420852936713, + "grad_norm": 1.1538465023040771, + "learning_rate": 4.143970841348588e-05, + "loss": 1.0176, + "step": 11578 + }, + { + "epoch": 3.514645621490363, + "grad_norm": 1.0509063005447388, + "learning_rate": 4.1434646147615675e-05, + "loss": 0.818, + "step": 11579 + }, + { + "epoch": 3.514949157687054, + "grad_norm": 0.8184328079223633, + "learning_rate": 4.142958388174547e-05, + "loss": 0.822, + "step": 11580 + }, + { + "epoch": 3.5152526938837454, + "grad_norm": 1.114806056022644, + "learning_rate": 4.1424521615875264e-05, + "loss": 0.636, + "step": 11581 + }, + { + "epoch": 3.515556230080437, + "grad_norm": 1.0318868160247803, + "learning_rate": 4.1419459350005066e-05, + "loss": 0.7519, + "step": 11582 + }, + { + "epoch": 3.5158597662771287, + "grad_norm": 1.1263445615768433, + "learning_rate": 4.141439708413486e-05, + "loss": 0.3719, + "step": 11583 + }, + { + "epoch": 3.51616330247382, + "grad_norm": 1.4899978637695312, + "learning_rate": 4.1409334818264655e-05, + "loss": 0.871, + "step": 11584 + }, + { + "epoch": 3.5164668386705116, + "grad_norm": 0.8651173114776611, + "learning_rate": 4.140427255239445e-05, + "loss": 0.6073, + "step": 11585 + }, + { + "epoch": 3.516770374867203, + "grad_norm": 0.6657924652099609, + "learning_rate": 4.1399210286524245e-05, + "loss": 0.9035, + "step": 11586 + }, + { + "epoch": 3.5170739110638944, + "grad_norm": 0.991733968257904, + "learning_rate": 4.1394148020654047e-05, + "loss": 0.7199, + "step": 11587 + }, + { + "epoch": 3.5173774472605857, + "grad_norm": 1.0055289268493652, + "learning_rate": 4.138908575478384e-05, + "loss": 0.5377, + "step": 11588 + }, + { + "epoch": 3.5176809834572773, + "grad_norm": 1.1510155200958252, + "learning_rate": 4.138402348891364e-05, + "loss": 0.8076, + "step": 11589 + }, + { + "epoch": 3.517984519653969, + "grad_norm": 0.8227588534355164, + "learning_rate": 4.137896122304344e-05, + "loss": 1.6266, + "step": 11590 + }, + { + "epoch": 3.51828805585066, + "grad_norm": 0.9144574403762817, + "learning_rate": 4.137389895717323e-05, + "loss": 0.5967, + "step": 11591 + }, + { + "epoch": 3.5185915920473514, + "grad_norm": 1.1394140720367432, + "learning_rate": 4.1368836691303034e-05, + "loss": 1.0172, + "step": 11592 + }, + { + "epoch": 3.518895128244043, + "grad_norm": 0.936918318271637, + "learning_rate": 4.136377442543283e-05, + "loss": 1.3813, + "step": 11593 + }, + { + "epoch": 3.5191986644407347, + "grad_norm": 1.1327282190322876, + "learning_rate": 4.1358712159562624e-05, + "loss": 0.9401, + "step": 11594 + }, + { + "epoch": 3.519502200637426, + "grad_norm": 1.0433191061019897, + "learning_rate": 4.135364989369242e-05, + "loss": 1.0838, + "step": 11595 + }, + { + "epoch": 3.5198057368341176, + "grad_norm": 1.0733057260513306, + "learning_rate": 4.134858762782221e-05, + "loss": 0.9442, + "step": 11596 + }, + { + "epoch": 3.520109273030809, + "grad_norm": 1.0711328983306885, + "learning_rate": 4.1343525361952015e-05, + "loss": 1.2619, + "step": 11597 + }, + { + "epoch": 3.5204128092275004, + "grad_norm": 0.8375397324562073, + "learning_rate": 4.133846309608181e-05, + "loss": 0.8579, + "step": 11598 + }, + { + "epoch": 3.5207163454241917, + "grad_norm": 1.2810218334197998, + "learning_rate": 4.1333400830211604e-05, + "loss": 0.7859, + "step": 11599 + }, + { + "epoch": 3.5210198816208833, + "grad_norm": 1.3691848516464233, + "learning_rate": 4.13283385643414e-05, + "loss": 0.9553, + "step": 11600 + }, + { + "epoch": 3.521323417817575, + "grad_norm": 0.9362868666648865, + "learning_rate": 4.1323276298471194e-05, + "loss": 1.23, + "step": 11601 + }, + { + "epoch": 3.521626954014266, + "grad_norm": 1.025485634803772, + "learning_rate": 4.1318214032600996e-05, + "loss": 0.9953, + "step": 11602 + }, + { + "epoch": 3.5219304902109574, + "grad_norm": 1.0449550151824951, + "learning_rate": 4.131315176673079e-05, + "loss": 0.6131, + "step": 11603 + }, + { + "epoch": 3.522234026407649, + "grad_norm": 1.237074851989746, + "learning_rate": 4.1308089500860585e-05, + "loss": 0.5954, + "step": 11604 + }, + { + "epoch": 3.5225375626043407, + "grad_norm": 1.0649689435958862, + "learning_rate": 4.130302723499038e-05, + "loss": 1.0741, + "step": 11605 + }, + { + "epoch": 3.522841098801032, + "grad_norm": 1.0349031686782837, + "learning_rate": 4.129796496912018e-05, + "loss": 0.9255, + "step": 11606 + }, + { + "epoch": 3.5231446349977236, + "grad_norm": 1.0876766443252563, + "learning_rate": 4.1292902703249976e-05, + "loss": 0.7351, + "step": 11607 + }, + { + "epoch": 3.523448171194415, + "grad_norm": 0.9588029384613037, + "learning_rate": 4.128784043737977e-05, + "loss": 0.8934, + "step": 11608 + }, + { + "epoch": 3.5237517073911064, + "grad_norm": 0.9372028112411499, + "learning_rate": 4.1282778171509566e-05, + "loss": 0.967, + "step": 11609 + }, + { + "epoch": 3.5240552435877976, + "grad_norm": 1.0971843004226685, + "learning_rate": 4.127771590563936e-05, + "loss": 0.8033, + "step": 11610 + }, + { + "epoch": 3.5243587797844893, + "grad_norm": 1.0889402627944946, + "learning_rate": 4.127265363976916e-05, + "loss": 0.9257, + "step": 11611 + }, + { + "epoch": 3.524662315981181, + "grad_norm": 1.1245019435882568, + "learning_rate": 4.1267591373898964e-05, + "loss": 0.7184, + "step": 11612 + }, + { + "epoch": 3.524965852177872, + "grad_norm": 0.8279440402984619, + "learning_rate": 4.126252910802876e-05, + "loss": 1.1031, + "step": 11613 + }, + { + "epoch": 3.525269388374564, + "grad_norm": 0.9139112830162048, + "learning_rate": 4.1257466842158553e-05, + "loss": 1.3667, + "step": 11614 + }, + { + "epoch": 3.525572924571255, + "grad_norm": 1.1071287393569946, + "learning_rate": 4.125240457628835e-05, + "loss": 0.7096, + "step": 11615 + }, + { + "epoch": 3.5258764607679467, + "grad_norm": 1.11435067653656, + "learning_rate": 4.124734231041815e-05, + "loss": 1.031, + "step": 11616 + }, + { + "epoch": 3.526179996964638, + "grad_norm": 1.1879462003707886, + "learning_rate": 4.1242280044547945e-05, + "loss": 0.9728, + "step": 11617 + }, + { + "epoch": 3.5264835331613296, + "grad_norm": 0.9365653991699219, + "learning_rate": 4.123721777867774e-05, + "loss": 0.6053, + "step": 11618 + }, + { + "epoch": 3.526787069358021, + "grad_norm": 0.9172948002815247, + "learning_rate": 4.1232155512807534e-05, + "loss": 1.0229, + "step": 11619 + }, + { + "epoch": 3.5270906055547124, + "grad_norm": 0.9319714903831482, + "learning_rate": 4.122709324693733e-05, + "loss": 0.9606, + "step": 11620 + }, + { + "epoch": 3.5273941417514036, + "grad_norm": 1.1065716743469238, + "learning_rate": 4.122203098106713e-05, + "loss": 0.7203, + "step": 11621 + }, + { + "epoch": 3.5276976779480953, + "grad_norm": 0.8528241515159607, + "learning_rate": 4.1216968715196925e-05, + "loss": 1.4332, + "step": 11622 + }, + { + "epoch": 3.528001214144787, + "grad_norm": 0.8970558047294617, + "learning_rate": 4.121190644932672e-05, + "loss": 0.8596, + "step": 11623 + }, + { + "epoch": 3.528304750341478, + "grad_norm": 1.0630778074264526, + "learning_rate": 4.1206844183456515e-05, + "loss": 1.0186, + "step": 11624 + }, + { + "epoch": 3.52860828653817, + "grad_norm": 1.1862809658050537, + "learning_rate": 4.120178191758631e-05, + "loss": 0.6389, + "step": 11625 + }, + { + "epoch": 3.528911822734861, + "grad_norm": 0.8416665196418762, + "learning_rate": 4.119671965171611e-05, + "loss": 1.4323, + "step": 11626 + }, + { + "epoch": 3.5292153589315527, + "grad_norm": 0.8950128555297852, + "learning_rate": 4.1191657385845906e-05, + "loss": 0.8696, + "step": 11627 + }, + { + "epoch": 3.529518895128244, + "grad_norm": 1.0043418407440186, + "learning_rate": 4.11865951199757e-05, + "loss": 1.231, + "step": 11628 + }, + { + "epoch": 3.5298224313249356, + "grad_norm": 1.304412603378296, + "learning_rate": 4.1181532854105496e-05, + "loss": 0.8209, + "step": 11629 + }, + { + "epoch": 3.530125967521627, + "grad_norm": 1.2766506671905518, + "learning_rate": 4.11764705882353e-05, + "loss": 0.9682, + "step": 11630 + }, + { + "epoch": 3.5304295037183184, + "grad_norm": 1.1540875434875488, + "learning_rate": 4.117140832236509e-05, + "loss": 1.1396, + "step": 11631 + }, + { + "epoch": 3.5307330399150096, + "grad_norm": 0.8981741070747375, + "learning_rate": 4.116634605649489e-05, + "loss": 1.1824, + "step": 11632 + }, + { + "epoch": 3.5310365761117013, + "grad_norm": 0.9279069304466248, + "learning_rate": 4.116128379062468e-05, + "loss": 0.7299, + "step": 11633 + }, + { + "epoch": 3.531340112308393, + "grad_norm": 1.1477733850479126, + "learning_rate": 4.115622152475448e-05, + "loss": 0.7281, + "step": 11634 + }, + { + "epoch": 3.531643648505084, + "grad_norm": 1.0965454578399658, + "learning_rate": 4.115115925888428e-05, + "loss": 1.1538, + "step": 11635 + }, + { + "epoch": 3.531947184701776, + "grad_norm": 0.9226409792900085, + "learning_rate": 4.114609699301408e-05, + "loss": 0.7281, + "step": 11636 + }, + { + "epoch": 3.532250720898467, + "grad_norm": 0.9386078715324402, + "learning_rate": 4.1141034727143874e-05, + "loss": 1.3893, + "step": 11637 + }, + { + "epoch": 3.5325542570951587, + "grad_norm": 0.8506246209144592, + "learning_rate": 4.113597246127367e-05, + "loss": 1.2293, + "step": 11638 + }, + { + "epoch": 3.53285779329185, + "grad_norm": 1.026960015296936, + "learning_rate": 4.1130910195403464e-05, + "loss": 0.8935, + "step": 11639 + }, + { + "epoch": 3.5331613294885416, + "grad_norm": 1.106721043586731, + "learning_rate": 4.112584792953326e-05, + "loss": 1.1717, + "step": 11640 + }, + { + "epoch": 3.533464865685233, + "grad_norm": 0.9380438327789307, + "learning_rate": 4.112078566366306e-05, + "loss": 1.1276, + "step": 11641 + }, + { + "epoch": 3.5337684018819244, + "grad_norm": 0.8998585939407349, + "learning_rate": 4.1115723397792855e-05, + "loss": 0.8254, + "step": 11642 + }, + { + "epoch": 3.5340719380786156, + "grad_norm": 1.0387241840362549, + "learning_rate": 4.111066113192265e-05, + "loss": 0.9171, + "step": 11643 + }, + { + "epoch": 3.5343754742753073, + "grad_norm": 1.150399088859558, + "learning_rate": 4.1105598866052445e-05, + "loss": 0.9431, + "step": 11644 + }, + { + "epoch": 3.534679010471999, + "grad_norm": 1.1193784475326538, + "learning_rate": 4.1100536600182246e-05, + "loss": 0.9782, + "step": 11645 + }, + { + "epoch": 3.53498254666869, + "grad_norm": 1.078013300895691, + "learning_rate": 4.109547433431204e-05, + "loss": 1.1689, + "step": 11646 + }, + { + "epoch": 3.535286082865382, + "grad_norm": 1.052385926246643, + "learning_rate": 4.1090412068441836e-05, + "loss": 0.8785, + "step": 11647 + }, + { + "epoch": 3.535589619062073, + "grad_norm": 0.9786132574081421, + "learning_rate": 4.108534980257163e-05, + "loss": 0.8227, + "step": 11648 + }, + { + "epoch": 3.5358931552587647, + "grad_norm": 0.8646959662437439, + "learning_rate": 4.1080287536701425e-05, + "loss": 0.7206, + "step": 11649 + }, + { + "epoch": 3.536196691455456, + "grad_norm": 0.8545046448707581, + "learning_rate": 4.107522527083123e-05, + "loss": 1.359, + "step": 11650 + }, + { + "epoch": 3.5365002276521476, + "grad_norm": 0.9969759583473206, + "learning_rate": 4.107016300496102e-05, + "loss": 0.839, + "step": 11651 + }, + { + "epoch": 3.536803763848839, + "grad_norm": 1.3190021514892578, + "learning_rate": 4.106510073909082e-05, + "loss": 1.0555, + "step": 11652 + }, + { + "epoch": 3.5371073000455304, + "grad_norm": 0.9127211570739746, + "learning_rate": 4.106003847322061e-05, + "loss": 1.2393, + "step": 11653 + }, + { + "epoch": 3.5374108362422216, + "grad_norm": 0.9713261127471924, + "learning_rate": 4.1054976207350406e-05, + "loss": 0.7892, + "step": 11654 + }, + { + "epoch": 3.5377143724389133, + "grad_norm": 1.0656861066818237, + "learning_rate": 4.104991394148021e-05, + "loss": 0.8826, + "step": 11655 + }, + { + "epoch": 3.538017908635605, + "grad_norm": 0.9511197209358215, + "learning_rate": 4.104485167561001e-05, + "loss": 0.9233, + "step": 11656 + }, + { + "epoch": 3.538321444832296, + "grad_norm": 0.7319124341011047, + "learning_rate": 4.1039789409739804e-05, + "loss": 1.3444, + "step": 11657 + }, + { + "epoch": 3.538624981028988, + "grad_norm": 0.905583918094635, + "learning_rate": 4.10347271438696e-05, + "loss": 0.768, + "step": 11658 + }, + { + "epoch": 3.538928517225679, + "grad_norm": 1.150198221206665, + "learning_rate": 4.1029664877999394e-05, + "loss": 0.7554, + "step": 11659 + }, + { + "epoch": 3.5392320534223707, + "grad_norm": 0.7048593163490295, + "learning_rate": 4.1024602612129195e-05, + "loss": 0.4503, + "step": 11660 + }, + { + "epoch": 3.539535589619062, + "grad_norm": 0.9462475180625916, + "learning_rate": 4.101954034625899e-05, + "loss": 1.2329, + "step": 11661 + }, + { + "epoch": 3.5398391258157536, + "grad_norm": 1.071959376335144, + "learning_rate": 4.1014478080388785e-05, + "loss": 1.0088, + "step": 11662 + }, + { + "epoch": 3.540142662012445, + "grad_norm": 0.9576846361160278, + "learning_rate": 4.100941581451858e-05, + "loss": 1.2331, + "step": 11663 + }, + { + "epoch": 3.5404461982091364, + "grad_norm": 0.9521212577819824, + "learning_rate": 4.1004353548648375e-05, + "loss": 1.0457, + "step": 11664 + }, + { + "epoch": 3.5407497344058276, + "grad_norm": 0.8998653292655945, + "learning_rate": 4.0999291282778176e-05, + "loss": 0.614, + "step": 11665 + }, + { + "epoch": 3.5410532706025193, + "grad_norm": 0.7305185794830322, + "learning_rate": 4.099422901690797e-05, + "loss": 0.499, + "step": 11666 + }, + { + "epoch": 3.541356806799211, + "grad_norm": 0.992877185344696, + "learning_rate": 4.0989166751037766e-05, + "loss": 0.905, + "step": 11667 + }, + { + "epoch": 3.541660342995902, + "grad_norm": 1.2652246952056885, + "learning_rate": 4.098410448516756e-05, + "loss": 0.6611, + "step": 11668 + }, + { + "epoch": 3.541963879192594, + "grad_norm": 0.9592330455780029, + "learning_rate": 4.097904221929736e-05, + "loss": 0.527, + "step": 11669 + }, + { + "epoch": 3.542267415389285, + "grad_norm": 0.9775657653808594, + "learning_rate": 4.097397995342716e-05, + "loss": 0.7883, + "step": 11670 + }, + { + "epoch": 3.5425709515859767, + "grad_norm": 0.9483558535575867, + "learning_rate": 4.096891768755695e-05, + "loss": 0.8846, + "step": 11671 + }, + { + "epoch": 3.542874487782668, + "grad_norm": 0.9615581631660461, + "learning_rate": 4.0963855421686746e-05, + "loss": 1.3313, + "step": 11672 + }, + { + "epoch": 3.5431780239793595, + "grad_norm": 0.7910706996917725, + "learning_rate": 4.095879315581654e-05, + "loss": 0.9975, + "step": 11673 + }, + { + "epoch": 3.543481560176051, + "grad_norm": 0.9117241501808167, + "learning_rate": 4.095373088994634e-05, + "loss": 0.8638, + "step": 11674 + }, + { + "epoch": 3.5437850963727424, + "grad_norm": 1.1577955484390259, + "learning_rate": 4.094866862407614e-05, + "loss": 0.8878, + "step": 11675 + }, + { + "epoch": 3.5440886325694336, + "grad_norm": 0.8803698420524597, + "learning_rate": 4.094360635820593e-05, + "loss": 0.7759, + "step": 11676 + }, + { + "epoch": 3.5443921687661253, + "grad_norm": 0.9368059039115906, + "learning_rate": 4.093854409233573e-05, + "loss": 1.5584, + "step": 11677 + }, + { + "epoch": 3.544695704962817, + "grad_norm": 1.024385929107666, + "learning_rate": 4.093348182646553e-05, + "loss": 1.2881, + "step": 11678 + }, + { + "epoch": 3.544999241159508, + "grad_norm": 1.2759578227996826, + "learning_rate": 4.0928419560595324e-05, + "loss": 0.5149, + "step": 11679 + }, + { + "epoch": 3.5453027773562, + "grad_norm": 0.9710344672203064, + "learning_rate": 4.0923357294725125e-05, + "loss": 0.8841, + "step": 11680 + }, + { + "epoch": 3.545606313552891, + "grad_norm": 0.9989772439002991, + "learning_rate": 4.091829502885492e-05, + "loss": 1.1351, + "step": 11681 + }, + { + "epoch": 3.5459098497495827, + "grad_norm": 1.0868676900863647, + "learning_rate": 4.0913232762984715e-05, + "loss": 0.9103, + "step": 11682 + }, + { + "epoch": 3.546213385946274, + "grad_norm": 1.174346685409546, + "learning_rate": 4.090817049711451e-05, + "loss": 1.0241, + "step": 11683 + }, + { + "epoch": 3.5465169221429655, + "grad_norm": 0.8900377154350281, + "learning_rate": 4.090310823124431e-05, + "loss": 1.1339, + "step": 11684 + }, + { + "epoch": 3.546820458339657, + "grad_norm": 1.3798744678497314, + "learning_rate": 4.0898045965374106e-05, + "loss": 0.6354, + "step": 11685 + }, + { + "epoch": 3.5471239945363484, + "grad_norm": 0.9964196085929871, + "learning_rate": 4.08929836995039e-05, + "loss": 0.5388, + "step": 11686 + }, + { + "epoch": 3.54742753073304, + "grad_norm": 1.1207791566848755, + "learning_rate": 4.0887921433633695e-05, + "loss": 0.8923, + "step": 11687 + }, + { + "epoch": 3.5477310669297313, + "grad_norm": 1.0456925630569458, + "learning_rate": 4.088285916776349e-05, + "loss": 0.9272, + "step": 11688 + }, + { + "epoch": 3.548034603126423, + "grad_norm": 1.2515175342559814, + "learning_rate": 4.087779690189329e-05, + "loss": 1.1531, + "step": 11689 + }, + { + "epoch": 3.548338139323114, + "grad_norm": 1.1480056047439575, + "learning_rate": 4.0872734636023087e-05, + "loss": 0.7701, + "step": 11690 + }, + { + "epoch": 3.548641675519806, + "grad_norm": 0.8538444638252258, + "learning_rate": 4.086767237015288e-05, + "loss": 1.3233, + "step": 11691 + }, + { + "epoch": 3.5489452117164975, + "grad_norm": 0.8730032444000244, + "learning_rate": 4.0862610104282676e-05, + "loss": 1.084, + "step": 11692 + }, + { + "epoch": 3.5492487479131887, + "grad_norm": 0.8308775424957275, + "learning_rate": 4.085754783841247e-05, + "loss": 0.6732, + "step": 11693 + }, + { + "epoch": 3.54955228410988, + "grad_norm": 1.2501428127288818, + "learning_rate": 4.085248557254227e-05, + "loss": 0.5583, + "step": 11694 + }, + { + "epoch": 3.5498558203065715, + "grad_norm": 0.8482654094696045, + "learning_rate": 4.084742330667207e-05, + "loss": 1.3252, + "step": 11695 + }, + { + "epoch": 3.550159356503263, + "grad_norm": 0.9812931418418884, + "learning_rate": 4.084236104080186e-05, + "loss": 1.1905, + "step": 11696 + }, + { + "epoch": 3.5504628926999544, + "grad_norm": 0.8222600221633911, + "learning_rate": 4.083729877493166e-05, + "loss": 0.3984, + "step": 11697 + }, + { + "epoch": 3.550766428896646, + "grad_norm": 0.9347089529037476, + "learning_rate": 4.083223650906146e-05, + "loss": 0.6439, + "step": 11698 + }, + { + "epoch": 3.5510699650933373, + "grad_norm": 1.017388105392456, + "learning_rate": 4.082717424319125e-05, + "loss": 1.1401, + "step": 11699 + }, + { + "epoch": 3.551373501290029, + "grad_norm": 1.0311392545700073, + "learning_rate": 4.082211197732105e-05, + "loss": 0.7581, + "step": 11700 + }, + { + "epoch": 3.55167703748672, + "grad_norm": 0.8759708404541016, + "learning_rate": 4.081704971145085e-05, + "loss": 0.8912, + "step": 11701 + }, + { + "epoch": 3.551980573683412, + "grad_norm": 1.0585432052612305, + "learning_rate": 4.0811987445580644e-05, + "loss": 0.6294, + "step": 11702 + }, + { + "epoch": 3.5522841098801035, + "grad_norm": 1.0939470529556274, + "learning_rate": 4.080692517971044e-05, + "loss": 0.755, + "step": 11703 + }, + { + "epoch": 3.5525876460767947, + "grad_norm": 1.2451399564743042, + "learning_rate": 4.080186291384024e-05, + "loss": 0.9862, + "step": 11704 + }, + { + "epoch": 3.552891182273486, + "grad_norm": 1.0536836385726929, + "learning_rate": 4.0796800647970036e-05, + "loss": 1.0147, + "step": 11705 + }, + { + "epoch": 3.5531947184701775, + "grad_norm": 0.9235849976539612, + "learning_rate": 4.079173838209983e-05, + "loss": 0.5516, + "step": 11706 + }, + { + "epoch": 3.553498254666869, + "grad_norm": 1.226388692855835, + "learning_rate": 4.0786676116229625e-05, + "loss": 0.9231, + "step": 11707 + }, + { + "epoch": 3.5538017908635604, + "grad_norm": 1.0994820594787598, + "learning_rate": 4.078161385035943e-05, + "loss": 1.1224, + "step": 11708 + }, + { + "epoch": 3.554105327060252, + "grad_norm": 1.0220235586166382, + "learning_rate": 4.077655158448922e-05, + "loss": 0.5849, + "step": 11709 + }, + { + "epoch": 3.5544088632569433, + "grad_norm": 0.9691071510314941, + "learning_rate": 4.0771489318619016e-05, + "loss": 0.6624, + "step": 11710 + }, + { + "epoch": 3.554712399453635, + "grad_norm": 1.0665960311889648, + "learning_rate": 4.076642705274881e-05, + "loss": 1.0031, + "step": 11711 + }, + { + "epoch": 3.555015935650326, + "grad_norm": 1.0929714441299438, + "learning_rate": 4.0761364786878606e-05, + "loss": 0.971, + "step": 11712 + }, + { + "epoch": 3.555319471847018, + "grad_norm": 1.045013189315796, + "learning_rate": 4.075630252100841e-05, + "loss": 0.5739, + "step": 11713 + }, + { + "epoch": 3.5556230080437095, + "grad_norm": 0.6971214413642883, + "learning_rate": 4.07512402551382e-05, + "loss": 0.8504, + "step": 11714 + }, + { + "epoch": 3.5559265442404007, + "grad_norm": 0.8493728637695312, + "learning_rate": 4.0746177989268e-05, + "loss": 0.8323, + "step": 11715 + }, + { + "epoch": 3.556230080437092, + "grad_norm": 0.7296162843704224, + "learning_rate": 4.074111572339779e-05, + "loss": 1.7127, + "step": 11716 + }, + { + "epoch": 3.5565336166337835, + "grad_norm": 1.337073802947998, + "learning_rate": 4.073605345752759e-05, + "loss": 1.0052, + "step": 11717 + }, + { + "epoch": 3.556837152830475, + "grad_norm": 1.0401408672332764, + "learning_rate": 4.073099119165739e-05, + "loss": 0.8159, + "step": 11718 + }, + { + "epoch": 3.5571406890271664, + "grad_norm": 1.1861757040023804, + "learning_rate": 4.072592892578718e-05, + "loss": 0.6847, + "step": 11719 + }, + { + "epoch": 3.557444225223858, + "grad_norm": 1.0336002111434937, + "learning_rate": 4.072086665991698e-05, + "loss": 1.3087, + "step": 11720 + }, + { + "epoch": 3.5577477614205493, + "grad_norm": 0.8979011178016663, + "learning_rate": 4.071580439404677e-05, + "loss": 1.3279, + "step": 11721 + }, + { + "epoch": 3.558051297617241, + "grad_norm": 1.0969467163085938, + "learning_rate": 4.0710742128176574e-05, + "loss": 1.2488, + "step": 11722 + }, + { + "epoch": 3.558354833813932, + "grad_norm": 1.0352734327316284, + "learning_rate": 4.0705679862306376e-05, + "loss": 0.858, + "step": 11723 + }, + { + "epoch": 3.558658370010624, + "grad_norm": 0.9439748525619507, + "learning_rate": 4.070061759643617e-05, + "loss": 0.9832, + "step": 11724 + }, + { + "epoch": 3.5589619062073155, + "grad_norm": 1.2412490844726562, + "learning_rate": 4.0695555330565965e-05, + "loss": 0.5824, + "step": 11725 + }, + { + "epoch": 3.5592654424040067, + "grad_norm": 0.9641359448432922, + "learning_rate": 4.069049306469576e-05, + "loss": 1.0533, + "step": 11726 + }, + { + "epoch": 3.559568978600698, + "grad_norm": 1.100813865661621, + "learning_rate": 4.0685430798825555e-05, + "loss": 1.4563, + "step": 11727 + }, + { + "epoch": 3.5598725147973895, + "grad_norm": 1.1757925748825073, + "learning_rate": 4.0680368532955357e-05, + "loss": 0.8884, + "step": 11728 + }, + { + "epoch": 3.560176050994081, + "grad_norm": 1.2507820129394531, + "learning_rate": 4.067530626708515e-05, + "loss": 0.8187, + "step": 11729 + }, + { + "epoch": 3.5604795871907724, + "grad_norm": 0.8340917825698853, + "learning_rate": 4.0670244001214946e-05, + "loss": 0.8718, + "step": 11730 + }, + { + "epoch": 3.560783123387464, + "grad_norm": 0.9683492183685303, + "learning_rate": 4.066518173534474e-05, + "loss": 0.4405, + "step": 11731 + }, + { + "epoch": 3.5610866595841553, + "grad_norm": 1.07754647731781, + "learning_rate": 4.0660119469474536e-05, + "loss": 1.0335, + "step": 11732 + }, + { + "epoch": 3.561390195780847, + "grad_norm": 1.1753009557724, + "learning_rate": 4.065505720360434e-05, + "loss": 0.826, + "step": 11733 + }, + { + "epoch": 3.561693731977538, + "grad_norm": 1.2171560525894165, + "learning_rate": 4.064999493773413e-05, + "loss": 0.964, + "step": 11734 + }, + { + "epoch": 3.56199726817423, + "grad_norm": 0.9547513723373413, + "learning_rate": 4.064493267186393e-05, + "loss": 1.2402, + "step": 11735 + }, + { + "epoch": 3.5623008043709214, + "grad_norm": 1.1694365739822388, + "learning_rate": 4.063987040599372e-05, + "loss": 0.528, + "step": 11736 + }, + { + "epoch": 3.5626043405676127, + "grad_norm": 1.253077507019043, + "learning_rate": 4.063480814012352e-05, + "loss": 0.9792, + "step": 11737 + }, + { + "epoch": 3.562907876764304, + "grad_norm": 1.1710174083709717, + "learning_rate": 4.062974587425332e-05, + "loss": 0.6814, + "step": 11738 + }, + { + "epoch": 3.5632114129609955, + "grad_norm": 1.0909475088119507, + "learning_rate": 4.062468360838311e-05, + "loss": 1.0917, + "step": 11739 + }, + { + "epoch": 3.563514949157687, + "grad_norm": 0.9481627345085144, + "learning_rate": 4.061962134251291e-05, + "loss": 1.3599, + "step": 11740 + }, + { + "epoch": 3.5638184853543784, + "grad_norm": 0.9830762147903442, + "learning_rate": 4.06145590766427e-05, + "loss": 1.0189, + "step": 11741 + }, + { + "epoch": 3.56412202155107, + "grad_norm": 0.9736356735229492, + "learning_rate": 4.0609496810772504e-05, + "loss": 0.8681, + "step": 11742 + }, + { + "epoch": 3.5644255577477613, + "grad_norm": 1.1125178337097168, + "learning_rate": 4.06044345449023e-05, + "loss": 0.5323, + "step": 11743 + }, + { + "epoch": 3.564729093944453, + "grad_norm": 0.8118406534194946, + "learning_rate": 4.0599372279032094e-05, + "loss": 0.5287, + "step": 11744 + }, + { + "epoch": 3.565032630141144, + "grad_norm": 0.843299388885498, + "learning_rate": 4.0594310013161895e-05, + "loss": 0.6565, + "step": 11745 + }, + { + "epoch": 3.565336166337836, + "grad_norm": 0.9035897254943848, + "learning_rate": 4.058924774729169e-05, + "loss": 1.2224, + "step": 11746 + }, + { + "epoch": 3.5656397025345274, + "grad_norm": 0.8780171871185303, + "learning_rate": 4.058418548142149e-05, + "loss": 0.7401, + "step": 11747 + }, + { + "epoch": 3.5659432387312187, + "grad_norm": 0.9170819520950317, + "learning_rate": 4.0579123215551286e-05, + "loss": 1.0428, + "step": 11748 + }, + { + "epoch": 3.5662467749279103, + "grad_norm": 1.1618040800094604, + "learning_rate": 4.057406094968108e-05, + "loss": 0.9617, + "step": 11749 + }, + { + "epoch": 3.5665503111246015, + "grad_norm": 1.238898515701294, + "learning_rate": 4.0568998683810876e-05, + "loss": 0.6535, + "step": 11750 + }, + { + "epoch": 3.566853847321293, + "grad_norm": 0.9754845499992371, + "learning_rate": 4.056393641794067e-05, + "loss": 0.8552, + "step": 11751 + }, + { + "epoch": 3.5671573835179844, + "grad_norm": 1.3999541997909546, + "learning_rate": 4.055887415207047e-05, + "loss": 1.2033, + "step": 11752 + }, + { + "epoch": 3.567460919714676, + "grad_norm": 0.937349259853363, + "learning_rate": 4.055381188620027e-05, + "loss": 0.8968, + "step": 11753 + }, + { + "epoch": 3.5677644559113677, + "grad_norm": 0.9076782464981079, + "learning_rate": 4.054874962033006e-05, + "loss": 1.0204, + "step": 11754 + }, + { + "epoch": 3.568067992108059, + "grad_norm": 0.8091506958007812, + "learning_rate": 4.054368735445986e-05, + "loss": 1.0718, + "step": 11755 + }, + { + "epoch": 3.56837152830475, + "grad_norm": 1.0529391765594482, + "learning_rate": 4.053862508858965e-05, + "loss": 0.8181, + "step": 11756 + }, + { + "epoch": 3.568675064501442, + "grad_norm": 1.1596744060516357, + "learning_rate": 4.053356282271945e-05, + "loss": 1.0346, + "step": 11757 + }, + { + "epoch": 3.5689786006981334, + "grad_norm": 0.7921033501625061, + "learning_rate": 4.052850055684925e-05, + "loss": 0.4932, + "step": 11758 + }, + { + "epoch": 3.5692821368948247, + "grad_norm": 0.9963080883026123, + "learning_rate": 4.052343829097904e-05, + "loss": 0.822, + "step": 11759 + }, + { + "epoch": 3.5695856730915163, + "grad_norm": 0.8828316330909729, + "learning_rate": 4.051837602510884e-05, + "loss": 1.2483, + "step": 11760 + }, + { + "epoch": 3.5698892092882075, + "grad_norm": 0.9907362461090088, + "learning_rate": 4.051331375923864e-05, + "loss": 1.0649, + "step": 11761 + }, + { + "epoch": 3.570192745484899, + "grad_norm": 1.062038779258728, + "learning_rate": 4.0508251493368434e-05, + "loss": 0.4807, + "step": 11762 + }, + { + "epoch": 3.5704962816815904, + "grad_norm": 1.1489720344543457, + "learning_rate": 4.050318922749823e-05, + "loss": 0.7864, + "step": 11763 + }, + { + "epoch": 3.570799817878282, + "grad_norm": 1.0680936574935913, + "learning_rate": 4.049812696162802e-05, + "loss": 0.9623, + "step": 11764 + }, + { + "epoch": 3.5711033540749737, + "grad_norm": 1.1594878435134888, + "learning_rate": 4.049306469575782e-05, + "loss": 0.9777, + "step": 11765 + }, + { + "epoch": 3.571406890271665, + "grad_norm": 1.1210532188415527, + "learning_rate": 4.048800242988762e-05, + "loss": 0.7153, + "step": 11766 + }, + { + "epoch": 3.571710426468356, + "grad_norm": 1.24998140335083, + "learning_rate": 4.048294016401742e-05, + "loss": 0.9661, + "step": 11767 + }, + { + "epoch": 3.572013962665048, + "grad_norm": 0.8914086222648621, + "learning_rate": 4.0477877898147216e-05, + "loss": 0.5626, + "step": 11768 + }, + { + "epoch": 3.5723174988617394, + "grad_norm": 1.233412504196167, + "learning_rate": 4.047281563227701e-05, + "loss": 0.9389, + "step": 11769 + }, + { + "epoch": 3.5726210350584306, + "grad_norm": 1.3267155885696411, + "learning_rate": 4.0467753366406806e-05, + "loss": 0.7851, + "step": 11770 + }, + { + "epoch": 3.5729245712551223, + "grad_norm": 1.0341217517852783, + "learning_rate": 4.04626911005366e-05, + "loss": 0.6631, + "step": 11771 + }, + { + "epoch": 3.5732281074518135, + "grad_norm": 1.137678861618042, + "learning_rate": 4.04576288346664e-05, + "loss": 0.9351, + "step": 11772 + }, + { + "epoch": 3.573531643648505, + "grad_norm": 0.7542721629142761, + "learning_rate": 4.04525665687962e-05, + "loss": 1.3512, + "step": 11773 + }, + { + "epoch": 3.5738351798451964, + "grad_norm": 0.9579821228981018, + "learning_rate": 4.044750430292599e-05, + "loss": 1.421, + "step": 11774 + }, + { + "epoch": 3.574138716041888, + "grad_norm": 1.0368013381958008, + "learning_rate": 4.0442442037055786e-05, + "loss": 0.761, + "step": 11775 + }, + { + "epoch": 3.5744422522385797, + "grad_norm": 1.1768851280212402, + "learning_rate": 4.043737977118559e-05, + "loss": 0.829, + "step": 11776 + }, + { + "epoch": 3.574745788435271, + "grad_norm": 1.0429997444152832, + "learning_rate": 4.043231750531538e-05, + "loss": 1.022, + "step": 11777 + }, + { + "epoch": 3.575049324631962, + "grad_norm": 1.068911075592041, + "learning_rate": 4.042725523944518e-05, + "loss": 1.2437, + "step": 11778 + }, + { + "epoch": 3.575352860828654, + "grad_norm": 1.2048927545547485, + "learning_rate": 4.042219297357497e-05, + "loss": 0.9431, + "step": 11779 + }, + { + "epoch": 3.5756563970253454, + "grad_norm": 1.1341795921325684, + "learning_rate": 4.041713070770477e-05, + "loss": 1.0646, + "step": 11780 + }, + { + "epoch": 3.5759599332220366, + "grad_norm": 0.8186066746711731, + "learning_rate": 4.041206844183457e-05, + "loss": 1.0714, + "step": 11781 + }, + { + "epoch": 3.5762634694187283, + "grad_norm": 1.1369121074676514, + "learning_rate": 4.0407006175964364e-05, + "loss": 1.1438, + "step": 11782 + }, + { + "epoch": 3.5765670056154195, + "grad_norm": 1.045667290687561, + "learning_rate": 4.040194391009416e-05, + "loss": 1.0004, + "step": 11783 + }, + { + "epoch": 3.576870541812111, + "grad_norm": 1.013298511505127, + "learning_rate": 4.039688164422395e-05, + "loss": 0.5399, + "step": 11784 + }, + { + "epoch": 3.5771740780088024, + "grad_norm": 0.8556250333786011, + "learning_rate": 4.039181937835375e-05, + "loss": 0.8242, + "step": 11785 + }, + { + "epoch": 3.577477614205494, + "grad_norm": 1.040802240371704, + "learning_rate": 4.038675711248355e-05, + "loss": 0.9041, + "step": 11786 + }, + { + "epoch": 3.5777811504021857, + "grad_norm": 1.103546380996704, + "learning_rate": 4.0381694846613344e-05, + "loss": 1.0424, + "step": 11787 + }, + { + "epoch": 3.578084686598877, + "grad_norm": 1.1253852844238281, + "learning_rate": 4.037663258074314e-05, + "loss": 1.0473, + "step": 11788 + }, + { + "epoch": 3.578388222795568, + "grad_norm": 1.0752980709075928, + "learning_rate": 4.037157031487294e-05, + "loss": 0.8329, + "step": 11789 + }, + { + "epoch": 3.5786917589922598, + "grad_norm": 1.3233833312988281, + "learning_rate": 4.0366508049002735e-05, + "loss": 1.1156, + "step": 11790 + }, + { + "epoch": 3.5789952951889514, + "grad_norm": 0.895180881023407, + "learning_rate": 4.036144578313254e-05, + "loss": 1.083, + "step": 11791 + }, + { + "epoch": 3.5792988313856426, + "grad_norm": 1.1815026998519897, + "learning_rate": 4.035638351726233e-05, + "loss": 0.5807, + "step": 11792 + }, + { + "epoch": 3.5796023675823343, + "grad_norm": 1.1306793689727783, + "learning_rate": 4.035132125139213e-05, + "loss": 0.9017, + "step": 11793 + }, + { + "epoch": 3.5799059037790255, + "grad_norm": 0.8197793364524841, + "learning_rate": 4.034625898552192e-05, + "loss": 0.7983, + "step": 11794 + }, + { + "epoch": 3.580209439975717, + "grad_norm": 1.0413057804107666, + "learning_rate": 4.0341196719651716e-05, + "loss": 1.2213, + "step": 11795 + }, + { + "epoch": 3.5805129761724084, + "grad_norm": 1.095939040184021, + "learning_rate": 4.033613445378152e-05, + "loss": 0.7021, + "step": 11796 + }, + { + "epoch": 3.5808165123691, + "grad_norm": 0.8002235293388367, + "learning_rate": 4.033107218791131e-05, + "loss": 1.041, + "step": 11797 + }, + { + "epoch": 3.5811200485657917, + "grad_norm": 1.1493903398513794, + "learning_rate": 4.032600992204111e-05, + "loss": 0.9643, + "step": 11798 + }, + { + "epoch": 3.581423584762483, + "grad_norm": 1.050647497177124, + "learning_rate": 4.03209476561709e-05, + "loss": 1.1067, + "step": 11799 + }, + { + "epoch": 3.581727120959174, + "grad_norm": 0.968075156211853, + "learning_rate": 4.0315885390300704e-05, + "loss": 1.2313, + "step": 11800 + }, + { + "epoch": 3.5820306571558658, + "grad_norm": 1.0812962055206299, + "learning_rate": 4.03108231244305e-05, + "loss": 0.7338, + "step": 11801 + }, + { + "epoch": 3.5823341933525574, + "grad_norm": 0.9601872563362122, + "learning_rate": 4.030576085856029e-05, + "loss": 1.1546, + "step": 11802 + }, + { + "epoch": 3.5826377295492486, + "grad_norm": 0.8790375590324402, + "learning_rate": 4.030069859269009e-05, + "loss": 1.655, + "step": 11803 + }, + { + "epoch": 3.5829412657459403, + "grad_norm": 0.7373253703117371, + "learning_rate": 4.029563632681988e-05, + "loss": 0.4492, + "step": 11804 + }, + { + "epoch": 3.5832448019426315, + "grad_norm": 0.8798004984855652, + "learning_rate": 4.0290574060949684e-05, + "loss": 0.7377, + "step": 11805 + }, + { + "epoch": 3.583548338139323, + "grad_norm": 1.1084705591201782, + "learning_rate": 4.028551179507948e-05, + "loss": 0.9164, + "step": 11806 + }, + { + "epoch": 3.5838518743360144, + "grad_norm": 1.196557641029358, + "learning_rate": 4.0280449529209274e-05, + "loss": 0.8813, + "step": 11807 + }, + { + "epoch": 3.584155410532706, + "grad_norm": 1.1060842275619507, + "learning_rate": 4.027538726333907e-05, + "loss": 0.6391, + "step": 11808 + }, + { + "epoch": 3.5844589467293977, + "grad_norm": 1.107773780822754, + "learning_rate": 4.0270324997468864e-05, + "loss": 0.8649, + "step": 11809 + }, + { + "epoch": 3.584762482926089, + "grad_norm": 0.9699903130531311, + "learning_rate": 4.0265262731598665e-05, + "loss": 1.3851, + "step": 11810 + }, + { + "epoch": 3.5850660191227806, + "grad_norm": 1.2007259130477905, + "learning_rate": 4.026020046572846e-05, + "loss": 0.976, + "step": 11811 + }, + { + "epoch": 3.5853695553194718, + "grad_norm": 0.919381320476532, + "learning_rate": 4.025513819985826e-05, + "loss": 1.1603, + "step": 11812 + }, + { + "epoch": 3.5856730915161634, + "grad_norm": 1.0891833305358887, + "learning_rate": 4.0250075933988056e-05, + "loss": 0.9378, + "step": 11813 + }, + { + "epoch": 3.5859766277128546, + "grad_norm": 1.1244471073150635, + "learning_rate": 4.024501366811785e-05, + "loss": 0.8003, + "step": 11814 + }, + { + "epoch": 3.5862801639095463, + "grad_norm": 0.9363517761230469, + "learning_rate": 4.023995140224765e-05, + "loss": 0.6852, + "step": 11815 + }, + { + "epoch": 3.5865837001062375, + "grad_norm": 0.8760140538215637, + "learning_rate": 4.023488913637745e-05, + "loss": 1.0153, + "step": 11816 + }, + { + "epoch": 3.586887236302929, + "grad_norm": 0.9312597513198853, + "learning_rate": 4.022982687050724e-05, + "loss": 1.4107, + "step": 11817 + }, + { + "epoch": 3.5871907724996204, + "grad_norm": 1.1470609903335571, + "learning_rate": 4.022476460463704e-05, + "loss": 0.9341, + "step": 11818 + }, + { + "epoch": 3.587494308696312, + "grad_norm": 1.4010385274887085, + "learning_rate": 4.021970233876683e-05, + "loss": 0.8192, + "step": 11819 + }, + { + "epoch": 3.5877978448930037, + "grad_norm": 1.0478214025497437, + "learning_rate": 4.0214640072896634e-05, + "loss": 1.1977, + "step": 11820 + }, + { + "epoch": 3.588101381089695, + "grad_norm": 1.1353591680526733, + "learning_rate": 4.020957780702643e-05, + "loss": 0.9412, + "step": 11821 + }, + { + "epoch": 3.5884049172863866, + "grad_norm": 1.0766994953155518, + "learning_rate": 4.020451554115622e-05, + "loss": 1.2963, + "step": 11822 + }, + { + "epoch": 3.5887084534830778, + "grad_norm": 1.2223659753799438, + "learning_rate": 4.019945327528602e-05, + "loss": 0.9088, + "step": 11823 + }, + { + "epoch": 3.5890119896797694, + "grad_norm": 1.0085211992263794, + "learning_rate": 4.019439100941581e-05, + "loss": 0.9986, + "step": 11824 + }, + { + "epoch": 3.5893155258764606, + "grad_norm": 1.0306519269943237, + "learning_rate": 4.0189328743545614e-05, + "loss": 1.0423, + "step": 11825 + }, + { + "epoch": 3.5896190620731523, + "grad_norm": 0.999062180519104, + "learning_rate": 4.018426647767541e-05, + "loss": 0.7243, + "step": 11826 + }, + { + "epoch": 3.589922598269844, + "grad_norm": 0.6834278702735901, + "learning_rate": 4.0179204211805204e-05, + "loss": 0.7771, + "step": 11827 + }, + { + "epoch": 3.590226134466535, + "grad_norm": 1.1712239980697632, + "learning_rate": 4.0174141945935e-05, + "loss": 0.9173, + "step": 11828 + }, + { + "epoch": 3.5905296706632264, + "grad_norm": 0.9996203780174255, + "learning_rate": 4.01690796800648e-05, + "loss": 0.9988, + "step": 11829 + }, + { + "epoch": 3.590833206859918, + "grad_norm": 1.3381606340408325, + "learning_rate": 4.0164017414194595e-05, + "loss": 0.8533, + "step": 11830 + }, + { + "epoch": 3.5911367430566097, + "grad_norm": 1.0494894981384277, + "learning_rate": 4.015895514832439e-05, + "loss": 0.4645, + "step": 11831 + }, + { + "epoch": 3.591440279253301, + "grad_norm": 1.437619686126709, + "learning_rate": 4.0153892882454185e-05, + "loss": 1.0605, + "step": 11832 + }, + { + "epoch": 3.5917438154499925, + "grad_norm": 0.9603704214096069, + "learning_rate": 4.014883061658398e-05, + "loss": 0.8602, + "step": 11833 + }, + { + "epoch": 3.5920473516466838, + "grad_norm": 0.8848273158073425, + "learning_rate": 4.014376835071378e-05, + "loss": 0.7956, + "step": 11834 + }, + { + "epoch": 3.5923508878433754, + "grad_norm": 0.820307195186615, + "learning_rate": 4.013870608484358e-05, + "loss": 0.5538, + "step": 11835 + }, + { + "epoch": 3.5926544240400666, + "grad_norm": 1.3730939626693726, + "learning_rate": 4.013364381897338e-05, + "loss": 0.8666, + "step": 11836 + }, + { + "epoch": 3.5929579602367583, + "grad_norm": 0.9075584411621094, + "learning_rate": 4.012858155310317e-05, + "loss": 0.7418, + "step": 11837 + }, + { + "epoch": 3.59326149643345, + "grad_norm": 1.0014084577560425, + "learning_rate": 4.012351928723297e-05, + "loss": 0.7762, + "step": 11838 + }, + { + "epoch": 3.593565032630141, + "grad_norm": 1.0412254333496094, + "learning_rate": 4.011845702136277e-05, + "loss": 0.708, + "step": 11839 + }, + { + "epoch": 3.5938685688268324, + "grad_norm": 0.79756098985672, + "learning_rate": 4.011339475549256e-05, + "loss": 1.4871, + "step": 11840 + }, + { + "epoch": 3.594172105023524, + "grad_norm": 1.0557396411895752, + "learning_rate": 4.010833248962236e-05, + "loss": 1.0165, + "step": 11841 + }, + { + "epoch": 3.5944756412202157, + "grad_norm": 0.8462033867835999, + "learning_rate": 4.010327022375215e-05, + "loss": 0.8688, + "step": 11842 + }, + { + "epoch": 3.594779177416907, + "grad_norm": 0.904782235622406, + "learning_rate": 4.009820795788195e-05, + "loss": 1.1974, + "step": 11843 + }, + { + "epoch": 3.5950827136135985, + "grad_norm": 1.0727719068527222, + "learning_rate": 4.009314569201175e-05, + "loss": 0.6736, + "step": 11844 + }, + { + "epoch": 3.5953862498102898, + "grad_norm": 1.1625694036483765, + "learning_rate": 4.0088083426141544e-05, + "loss": 0.7864, + "step": 11845 + }, + { + "epoch": 3.5956897860069814, + "grad_norm": 0.6735413074493408, + "learning_rate": 4.008302116027134e-05, + "loss": 1.7421, + "step": 11846 + }, + { + "epoch": 3.5959933222036726, + "grad_norm": 1.0197535753250122, + "learning_rate": 4.0077958894401134e-05, + "loss": 0.6592, + "step": 11847 + }, + { + "epoch": 3.5962968584003643, + "grad_norm": 1.229761004447937, + "learning_rate": 4.007289662853093e-05, + "loss": 0.9531, + "step": 11848 + }, + { + "epoch": 3.596600394597056, + "grad_norm": 1.3035818338394165, + "learning_rate": 4.006783436266073e-05, + "loss": 0.6248, + "step": 11849 + }, + { + "epoch": 3.596903930793747, + "grad_norm": 1.0814355611801147, + "learning_rate": 4.0062772096790525e-05, + "loss": 1.2993, + "step": 11850 + }, + { + "epoch": 3.5972074669904384, + "grad_norm": 0.9712275862693787, + "learning_rate": 4.005770983092032e-05, + "loss": 1.2693, + "step": 11851 + }, + { + "epoch": 3.59751100318713, + "grad_norm": 0.9117130637168884, + "learning_rate": 4.0052647565050114e-05, + "loss": 1.1846, + "step": 11852 + }, + { + "epoch": 3.5978145393838217, + "grad_norm": 1.3977762460708618, + "learning_rate": 4.0047585299179916e-05, + "loss": 0.6314, + "step": 11853 + }, + { + "epoch": 3.598118075580513, + "grad_norm": 1.0419931411743164, + "learning_rate": 4.004252303330971e-05, + "loss": 0.8202, + "step": 11854 + }, + { + "epoch": 3.5984216117772045, + "grad_norm": 1.1324694156646729, + "learning_rate": 4.0037460767439506e-05, + "loss": 0.9609, + "step": 11855 + }, + { + "epoch": 3.5987251479738958, + "grad_norm": 0.8851795196533203, + "learning_rate": 4.003239850156931e-05, + "loss": 0.6204, + "step": 11856 + }, + { + "epoch": 3.5990286841705874, + "grad_norm": 1.2004474401474, + "learning_rate": 4.00273362356991e-05, + "loss": 0.5828, + "step": 11857 + }, + { + "epoch": 3.5993322203672786, + "grad_norm": 1.026055932044983, + "learning_rate": 4.00222739698289e-05, + "loss": 1.1212, + "step": 11858 + }, + { + "epoch": 3.5996357565639703, + "grad_norm": 0.9376639723777771, + "learning_rate": 4.00172117039587e-05, + "loss": 0.6646, + "step": 11859 + }, + { + "epoch": 3.599939292760662, + "grad_norm": 1.2695647478103638, + "learning_rate": 4.001214943808849e-05, + "loss": 1.0429, + "step": 11860 + }, + { + "epoch": 3.600242828957353, + "grad_norm": 0.9345374703407288, + "learning_rate": 4.000708717221829e-05, + "loss": 0.7091, + "step": 11861 + }, + { + "epoch": 3.6005463651540444, + "grad_norm": 1.0844401121139526, + "learning_rate": 4.000202490634808e-05, + "loss": 1.2186, + "step": 11862 + }, + { + "epoch": 3.600849901350736, + "grad_norm": 0.877917468547821, + "learning_rate": 3.999696264047788e-05, + "loss": 0.6208, + "step": 11863 + }, + { + "epoch": 3.6011534375474277, + "grad_norm": 0.7851859927177429, + "learning_rate": 3.999190037460768e-05, + "loss": 0.9501, + "step": 11864 + }, + { + "epoch": 3.601456973744119, + "grad_norm": 1.137346863746643, + "learning_rate": 3.9986838108737474e-05, + "loss": 0.5643, + "step": 11865 + }, + { + "epoch": 3.6017605099408105, + "grad_norm": 1.037071704864502, + "learning_rate": 3.998177584286727e-05, + "loss": 0.7844, + "step": 11866 + }, + { + "epoch": 3.6020640461375018, + "grad_norm": 0.8675159811973572, + "learning_rate": 3.9976713576997063e-05, + "loss": 0.7246, + "step": 11867 + }, + { + "epoch": 3.6023675823341934, + "grad_norm": 1.2127631902694702, + "learning_rate": 3.9971651311126865e-05, + "loss": 0.6128, + "step": 11868 + }, + { + "epoch": 3.6026711185308846, + "grad_norm": 1.0057828426361084, + "learning_rate": 3.996658904525666e-05, + "loss": 0.7259, + "step": 11869 + }, + { + "epoch": 3.6029746547275763, + "grad_norm": 1.0663315057754517, + "learning_rate": 3.9961526779386455e-05, + "loss": 0.8426, + "step": 11870 + }, + { + "epoch": 3.603278190924268, + "grad_norm": 1.1992449760437012, + "learning_rate": 3.995646451351625e-05, + "loss": 0.6921, + "step": 11871 + }, + { + "epoch": 3.603581727120959, + "grad_norm": 0.9856591820716858, + "learning_rate": 3.9951402247646044e-05, + "loss": 1.1888, + "step": 11872 + }, + { + "epoch": 3.6038852633176504, + "grad_norm": 1.100213646888733, + "learning_rate": 3.9946339981775846e-05, + "loss": 0.531, + "step": 11873 + }, + { + "epoch": 3.604188799514342, + "grad_norm": 0.5835770964622498, + "learning_rate": 3.994127771590564e-05, + "loss": 1.088, + "step": 11874 + }, + { + "epoch": 3.6044923357110337, + "grad_norm": 1.1424405574798584, + "learning_rate": 3.9936215450035435e-05, + "loss": 0.6572, + "step": 11875 + }, + { + "epoch": 3.604795871907725, + "grad_norm": 1.0458776950836182, + "learning_rate": 3.993115318416523e-05, + "loss": 1.0433, + "step": 11876 + }, + { + "epoch": 3.6050994081044165, + "grad_norm": 0.9838107228279114, + "learning_rate": 3.9926090918295025e-05, + "loss": 0.8924, + "step": 11877 + }, + { + "epoch": 3.6054029443011077, + "grad_norm": 1.2607207298278809, + "learning_rate": 3.992102865242483e-05, + "loss": 0.7741, + "step": 11878 + }, + { + "epoch": 3.6057064804977994, + "grad_norm": 1.0495408773422241, + "learning_rate": 3.991596638655463e-05, + "loss": 1.1024, + "step": 11879 + }, + { + "epoch": 3.6060100166944906, + "grad_norm": 1.2188318967819214, + "learning_rate": 3.991090412068442e-05, + "loss": 0.8565, + "step": 11880 + }, + { + "epoch": 3.6063135528911823, + "grad_norm": 1.0403095483779907, + "learning_rate": 3.990584185481422e-05, + "loss": 1.1435, + "step": 11881 + }, + { + "epoch": 3.606617089087874, + "grad_norm": 1.2030267715454102, + "learning_rate": 3.990077958894401e-05, + "loss": 0.9283, + "step": 11882 + }, + { + "epoch": 3.606920625284565, + "grad_norm": 0.816705584526062, + "learning_rate": 3.9895717323073814e-05, + "loss": 0.9195, + "step": 11883 + }, + { + "epoch": 3.607224161481257, + "grad_norm": 1.0475828647613525, + "learning_rate": 3.989065505720361e-05, + "loss": 1.3358, + "step": 11884 + }, + { + "epoch": 3.607527697677948, + "grad_norm": 1.0253729820251465, + "learning_rate": 3.9885592791333404e-05, + "loss": 0.7581, + "step": 11885 + }, + { + "epoch": 3.6078312338746397, + "grad_norm": 0.9679356217384338, + "learning_rate": 3.98805305254632e-05, + "loss": 1.147, + "step": 11886 + }, + { + "epoch": 3.608134770071331, + "grad_norm": 0.9187350273132324, + "learning_rate": 3.987546825959299e-05, + "loss": 1.2669, + "step": 11887 + }, + { + "epoch": 3.6084383062680225, + "grad_norm": 0.5598514080047607, + "learning_rate": 3.9870405993722795e-05, + "loss": 0.8098, + "step": 11888 + }, + { + "epoch": 3.608741842464714, + "grad_norm": 0.9313917756080627, + "learning_rate": 3.986534372785259e-05, + "loss": 0.9218, + "step": 11889 + }, + { + "epoch": 3.6090453786614054, + "grad_norm": 0.9473603963851929, + "learning_rate": 3.9860281461982384e-05, + "loss": 1.241, + "step": 11890 + }, + { + "epoch": 3.6093489148580966, + "grad_norm": 1.0664430856704712, + "learning_rate": 3.985521919611218e-05, + "loss": 0.9809, + "step": 11891 + }, + { + "epoch": 3.6096524510547883, + "grad_norm": 1.0459421873092651, + "learning_rate": 3.985015693024198e-05, + "loss": 1.0754, + "step": 11892 + }, + { + "epoch": 3.60995598725148, + "grad_norm": 0.7948506474494934, + "learning_rate": 3.9845094664371775e-05, + "loss": 1.398, + "step": 11893 + }, + { + "epoch": 3.610259523448171, + "grad_norm": 1.124526858329773, + "learning_rate": 3.984003239850157e-05, + "loss": 0.5873, + "step": 11894 + }, + { + "epoch": 3.610563059644863, + "grad_norm": 1.4024449586868286, + "learning_rate": 3.9834970132631365e-05, + "loss": 0.866, + "step": 11895 + }, + { + "epoch": 3.610866595841554, + "grad_norm": 1.2529752254486084, + "learning_rate": 3.982990786676116e-05, + "loss": 0.9286, + "step": 11896 + }, + { + "epoch": 3.6111701320382457, + "grad_norm": 1.049893856048584, + "learning_rate": 3.982484560089096e-05, + "loss": 1.1412, + "step": 11897 + }, + { + "epoch": 3.611473668234937, + "grad_norm": 1.1598997116088867, + "learning_rate": 3.9819783335020756e-05, + "loss": 0.8983, + "step": 11898 + }, + { + "epoch": 3.6117772044316285, + "grad_norm": 0.9188267588615417, + "learning_rate": 3.981472106915055e-05, + "loss": 0.7377, + "step": 11899 + }, + { + "epoch": 3.61208074062832, + "grad_norm": 1.0305477380752563, + "learning_rate": 3.9809658803280346e-05, + "loss": 1.0122, + "step": 11900 + }, + { + "epoch": 3.6123842768250114, + "grad_norm": 0.9974192976951599, + "learning_rate": 3.980459653741015e-05, + "loss": 1.3315, + "step": 11901 + }, + { + "epoch": 3.6126878130217026, + "grad_norm": 1.0644034147262573, + "learning_rate": 3.979953427153994e-05, + "loss": 1.15, + "step": 11902 + }, + { + "epoch": 3.6129913492183943, + "grad_norm": 1.2176085710525513, + "learning_rate": 3.9794472005669744e-05, + "loss": 0.9106, + "step": 11903 + }, + { + "epoch": 3.613294885415086, + "grad_norm": 0.663973331451416, + "learning_rate": 3.978940973979954e-05, + "loss": 1.4712, + "step": 11904 + }, + { + "epoch": 3.613598421611777, + "grad_norm": 1.0275557041168213, + "learning_rate": 3.978434747392933e-05, + "loss": 0.6661, + "step": 11905 + }, + { + "epoch": 3.613901957808469, + "grad_norm": 1.0963793992996216, + "learning_rate": 3.977928520805913e-05, + "loss": 0.6313, + "step": 11906 + }, + { + "epoch": 3.61420549400516, + "grad_norm": 0.9998677372932434, + "learning_rate": 3.977422294218893e-05, + "loss": 0.6707, + "step": 11907 + }, + { + "epoch": 3.6145090302018517, + "grad_norm": 0.8151798248291016, + "learning_rate": 3.9769160676318725e-05, + "loss": 0.2526, + "step": 11908 + }, + { + "epoch": 3.614812566398543, + "grad_norm": 1.2791935205459595, + "learning_rate": 3.976409841044852e-05, + "loss": 1.2082, + "step": 11909 + }, + { + "epoch": 3.6151161025952345, + "grad_norm": 0.9526668190956116, + "learning_rate": 3.9759036144578314e-05, + "loss": 0.823, + "step": 11910 + }, + { + "epoch": 3.615419638791926, + "grad_norm": 1.0068575143814087, + "learning_rate": 3.975397387870811e-05, + "loss": 1.2525, + "step": 11911 + }, + { + "epoch": 3.6157231749886174, + "grad_norm": 1.2047441005706787, + "learning_rate": 3.974891161283791e-05, + "loss": 0.6719, + "step": 11912 + }, + { + "epoch": 3.6160267111853086, + "grad_norm": 1.2417141199111938, + "learning_rate": 3.9743849346967705e-05, + "loss": 0.8964, + "step": 11913 + }, + { + "epoch": 3.6163302473820003, + "grad_norm": 0.9624064564704895, + "learning_rate": 3.97387870810975e-05, + "loss": 1.0136, + "step": 11914 + }, + { + "epoch": 3.616633783578692, + "grad_norm": 1.1036078929901123, + "learning_rate": 3.9733724815227295e-05, + "loss": 1.0755, + "step": 11915 + }, + { + "epoch": 3.616937319775383, + "grad_norm": 1.100644588470459, + "learning_rate": 3.972866254935709e-05, + "loss": 1.0249, + "step": 11916 + }, + { + "epoch": 3.617240855972075, + "grad_norm": 1.078944444656372, + "learning_rate": 3.972360028348689e-05, + "loss": 0.9462, + "step": 11917 + }, + { + "epoch": 3.617544392168766, + "grad_norm": 1.0892934799194336, + "learning_rate": 3.9718538017616686e-05, + "loss": 0.8617, + "step": 11918 + }, + { + "epoch": 3.6178479283654577, + "grad_norm": 1.065589189529419, + "learning_rate": 3.971347575174648e-05, + "loss": 0.6647, + "step": 11919 + }, + { + "epoch": 3.618151464562149, + "grad_norm": 1.1586716175079346, + "learning_rate": 3.9708413485876276e-05, + "loss": 1.0423, + "step": 11920 + }, + { + "epoch": 3.6184550007588405, + "grad_norm": 1.055550217628479, + "learning_rate": 3.970335122000608e-05, + "loss": 0.9072, + "step": 11921 + }, + { + "epoch": 3.618758536955532, + "grad_norm": 1.1805459260940552, + "learning_rate": 3.969828895413587e-05, + "loss": 0.8216, + "step": 11922 + }, + { + "epoch": 3.6190620731522234, + "grad_norm": 1.265092372894287, + "learning_rate": 3.9693226688265674e-05, + "loss": 0.8606, + "step": 11923 + }, + { + "epoch": 3.6193656093489146, + "grad_norm": 1.2067534923553467, + "learning_rate": 3.968816442239547e-05, + "loss": 1.0322, + "step": 11924 + }, + { + "epoch": 3.6196691455456063, + "grad_norm": 1.1063047647476196, + "learning_rate": 3.968310215652526e-05, + "loss": 0.9882, + "step": 11925 + }, + { + "epoch": 3.619972681742298, + "grad_norm": 1.066673994064331, + "learning_rate": 3.967803989065506e-05, + "loss": 1.3239, + "step": 11926 + }, + { + "epoch": 3.620276217938989, + "grad_norm": 0.9495149254798889, + "learning_rate": 3.967297762478486e-05, + "loss": 1.2549, + "step": 11927 + }, + { + "epoch": 3.620579754135681, + "grad_norm": 0.7284690141677856, + "learning_rate": 3.9667915358914654e-05, + "loss": 1.0705, + "step": 11928 + }, + { + "epoch": 3.620883290332372, + "grad_norm": 0.9695112705230713, + "learning_rate": 3.966285309304445e-05, + "loss": 1.2445, + "step": 11929 + }, + { + "epoch": 3.6211868265290637, + "grad_norm": 0.9724299311637878, + "learning_rate": 3.9657790827174244e-05, + "loss": 0.8784, + "step": 11930 + }, + { + "epoch": 3.621490362725755, + "grad_norm": 0.7860652804374695, + "learning_rate": 3.9652728561304045e-05, + "loss": 0.4055, + "step": 11931 + }, + { + "epoch": 3.6217938989224465, + "grad_norm": 0.7782607078552246, + "learning_rate": 3.964766629543384e-05, + "loss": 0.8186, + "step": 11932 + }, + { + "epoch": 3.622097435119138, + "grad_norm": 1.053109884262085, + "learning_rate": 3.9642604029563635e-05, + "loss": 0.8016, + "step": 11933 + }, + { + "epoch": 3.6224009713158294, + "grad_norm": 1.4515295028686523, + "learning_rate": 3.963754176369343e-05, + "loss": 0.928, + "step": 11934 + }, + { + "epoch": 3.6227045075125206, + "grad_norm": 0.9363054037094116, + "learning_rate": 3.9632479497823225e-05, + "loss": 0.6195, + "step": 11935 + }, + { + "epoch": 3.6230080437092123, + "grad_norm": 1.2641396522521973, + "learning_rate": 3.9627417231953026e-05, + "loss": 0.7346, + "step": 11936 + }, + { + "epoch": 3.623311579905904, + "grad_norm": 1.0036572217941284, + "learning_rate": 3.962235496608282e-05, + "loss": 0.9018, + "step": 11937 + }, + { + "epoch": 3.623615116102595, + "grad_norm": 0.9050203561782837, + "learning_rate": 3.9617292700212616e-05, + "loss": 1.5189, + "step": 11938 + }, + { + "epoch": 3.623918652299287, + "grad_norm": 1.2951507568359375, + "learning_rate": 3.961223043434241e-05, + "loss": 0.6351, + "step": 11939 + }, + { + "epoch": 3.624222188495978, + "grad_norm": 1.0796775817871094, + "learning_rate": 3.9607168168472205e-05, + "loss": 0.8242, + "step": 11940 + }, + { + "epoch": 3.6245257246926696, + "grad_norm": 0.980072021484375, + "learning_rate": 3.960210590260201e-05, + "loss": 1.0325, + "step": 11941 + }, + { + "epoch": 3.624829260889361, + "grad_norm": 0.9182207584381104, + "learning_rate": 3.95970436367318e-05, + "loss": 0.4401, + "step": 11942 + }, + { + "epoch": 3.6251327970860525, + "grad_norm": 0.8907768130302429, + "learning_rate": 3.9591981370861597e-05, + "loss": 1.2621, + "step": 11943 + }, + { + "epoch": 3.625436333282744, + "grad_norm": 0.9926165342330933, + "learning_rate": 3.958691910499139e-05, + "loss": 1.1719, + "step": 11944 + }, + { + "epoch": 3.6257398694794354, + "grad_norm": 1.0810649394989014, + "learning_rate": 3.958185683912119e-05, + "loss": 0.763, + "step": 11945 + }, + { + "epoch": 3.626043405676127, + "grad_norm": 0.9303694367408752, + "learning_rate": 3.9576794573250994e-05, + "loss": 1.2431, + "step": 11946 + }, + { + "epoch": 3.6263469418728183, + "grad_norm": 0.9502303004264832, + "learning_rate": 3.957173230738079e-05, + "loss": 0.6175, + "step": 11947 + }, + { + "epoch": 3.62665047806951, + "grad_norm": 0.8279978036880493, + "learning_rate": 3.9566670041510584e-05, + "loss": 1.1286, + "step": 11948 + }, + { + "epoch": 3.626954014266201, + "grad_norm": 1.1536316871643066, + "learning_rate": 3.956160777564038e-05, + "loss": 0.9877, + "step": 11949 + }, + { + "epoch": 3.6272575504628928, + "grad_norm": 1.0675565004348755, + "learning_rate": 3.9556545509770174e-05, + "loss": 0.7663, + "step": 11950 + }, + { + "epoch": 3.6275610866595844, + "grad_norm": 0.7724364995956421, + "learning_rate": 3.9551483243899975e-05, + "loss": 1.122, + "step": 11951 + }, + { + "epoch": 3.6278646228562756, + "grad_norm": 1.1702176332473755, + "learning_rate": 3.954642097802977e-05, + "loss": 0.4571, + "step": 11952 + }, + { + "epoch": 3.628168159052967, + "grad_norm": 1.0796974897384644, + "learning_rate": 3.9541358712159565e-05, + "loss": 0.6647, + "step": 11953 + }, + { + "epoch": 3.6284716952496585, + "grad_norm": 1.1795870065689087, + "learning_rate": 3.953629644628936e-05, + "loss": 0.9218, + "step": 11954 + }, + { + "epoch": 3.62877523144635, + "grad_norm": 0.7589983940124512, + "learning_rate": 3.9531234180419154e-05, + "loss": 1.4991, + "step": 11955 + }, + { + "epoch": 3.6290787676430414, + "grad_norm": 0.8205420970916748, + "learning_rate": 3.9526171914548956e-05, + "loss": 0.5455, + "step": 11956 + }, + { + "epoch": 3.629382303839733, + "grad_norm": 0.9213430881500244, + "learning_rate": 3.952110964867875e-05, + "loss": 1.4167, + "step": 11957 + }, + { + "epoch": 3.6296858400364242, + "grad_norm": 1.1488837003707886, + "learning_rate": 3.9516047382808546e-05, + "loss": 0.6635, + "step": 11958 + }, + { + "epoch": 3.629989376233116, + "grad_norm": 0.8026210069656372, + "learning_rate": 3.951098511693834e-05, + "loss": 1.3653, + "step": 11959 + }, + { + "epoch": 3.630292912429807, + "grad_norm": 0.8148407340049744, + "learning_rate": 3.950592285106814e-05, + "loss": 0.4883, + "step": 11960 + }, + { + "epoch": 3.6305964486264988, + "grad_norm": 0.990105926990509, + "learning_rate": 3.950086058519794e-05, + "loss": 0.701, + "step": 11961 + }, + { + "epoch": 3.6308999848231904, + "grad_norm": 1.0605705976486206, + "learning_rate": 3.949579831932773e-05, + "loss": 0.7958, + "step": 11962 + }, + { + "epoch": 3.6312035210198816, + "grad_norm": 1.0559285879135132, + "learning_rate": 3.9490736053457526e-05, + "loss": 0.999, + "step": 11963 + }, + { + "epoch": 3.631507057216573, + "grad_norm": 0.9919759035110474, + "learning_rate": 3.948567378758732e-05, + "loss": 1.4333, + "step": 11964 + }, + { + "epoch": 3.6318105934132645, + "grad_norm": 0.7387545108795166, + "learning_rate": 3.948061152171712e-05, + "loss": 0.5685, + "step": 11965 + }, + { + "epoch": 3.632114129609956, + "grad_norm": 1.0928908586502075, + "learning_rate": 3.947554925584692e-05, + "loss": 0.9233, + "step": 11966 + }, + { + "epoch": 3.6324176658066474, + "grad_norm": 0.9104135036468506, + "learning_rate": 3.947048698997672e-05, + "loss": 0.7763, + "step": 11967 + }, + { + "epoch": 3.632721202003339, + "grad_norm": 1.047778606414795, + "learning_rate": 3.9465424724106514e-05, + "loss": 1.124, + "step": 11968 + }, + { + "epoch": 3.6330247382000302, + "grad_norm": 1.044195532798767, + "learning_rate": 3.946036245823631e-05, + "loss": 0.9499, + "step": 11969 + }, + { + "epoch": 3.633328274396722, + "grad_norm": 0.8726449012756348, + "learning_rate": 3.945530019236611e-05, + "loss": 0.6087, + "step": 11970 + }, + { + "epoch": 3.633631810593413, + "grad_norm": 0.9909285306930542, + "learning_rate": 3.9450237926495905e-05, + "loss": 1.1826, + "step": 11971 + }, + { + "epoch": 3.6339353467901048, + "grad_norm": 1.02983820438385, + "learning_rate": 3.94451756606257e-05, + "loss": 1.3371, + "step": 11972 + }, + { + "epoch": 3.6342388829867964, + "grad_norm": 1.1315581798553467, + "learning_rate": 3.9440113394755495e-05, + "loss": 1.1881, + "step": 11973 + }, + { + "epoch": 3.6345424191834876, + "grad_norm": 0.9457393884658813, + "learning_rate": 3.943505112888529e-05, + "loss": 0.9687, + "step": 11974 + }, + { + "epoch": 3.634845955380179, + "grad_norm": 0.8868625164031982, + "learning_rate": 3.942998886301509e-05, + "loss": 0.9391, + "step": 11975 + }, + { + "epoch": 3.6351494915768705, + "grad_norm": 1.1828168630599976, + "learning_rate": 3.9424926597144886e-05, + "loss": 0.6879, + "step": 11976 + }, + { + "epoch": 3.635453027773562, + "grad_norm": 0.8844999074935913, + "learning_rate": 3.941986433127468e-05, + "loss": 1.1787, + "step": 11977 + }, + { + "epoch": 3.6357565639702534, + "grad_norm": 0.9217205047607422, + "learning_rate": 3.9414802065404475e-05, + "loss": 0.966, + "step": 11978 + }, + { + "epoch": 3.636060100166945, + "grad_norm": 1.0602003335952759, + "learning_rate": 3.940973979953427e-05, + "loss": 1.3161, + "step": 11979 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 1.1117497682571411, + "learning_rate": 3.940467753366407e-05, + "loss": 0.9635, + "step": 11980 + }, + { + "epoch": 3.636667172560328, + "grad_norm": 1.2898273468017578, + "learning_rate": 3.9399615267793866e-05, + "loss": 1.0055, + "step": 11981 + }, + { + "epoch": 3.636970708757019, + "grad_norm": 1.095345139503479, + "learning_rate": 3.939455300192366e-05, + "loss": 0.5969, + "step": 11982 + }, + { + "epoch": 3.6372742449537108, + "grad_norm": 1.1055399179458618, + "learning_rate": 3.9389490736053456e-05, + "loss": 0.5385, + "step": 11983 + }, + { + "epoch": 3.6375777811504024, + "grad_norm": 1.2485231161117554, + "learning_rate": 3.938442847018326e-05, + "loss": 0.6341, + "step": 11984 + }, + { + "epoch": 3.6378813173470936, + "grad_norm": 0.9820212721824646, + "learning_rate": 3.937936620431305e-05, + "loss": 1.0154, + "step": 11985 + }, + { + "epoch": 3.638184853543785, + "grad_norm": 0.865306556224823, + "learning_rate": 3.937430393844285e-05, + "loss": 1.2112, + "step": 11986 + }, + { + "epoch": 3.6384883897404765, + "grad_norm": 1.1621384620666504, + "learning_rate": 3.936924167257264e-05, + "loss": 1.1591, + "step": 11987 + }, + { + "epoch": 3.638791925937168, + "grad_norm": 0.8271921873092651, + "learning_rate": 3.936417940670244e-05, + "loss": 1.4694, + "step": 11988 + }, + { + "epoch": 3.6390954621338594, + "grad_norm": 0.9038305282592773, + "learning_rate": 3.935911714083224e-05, + "loss": 1.42, + "step": 11989 + }, + { + "epoch": 3.639398998330551, + "grad_norm": 1.1220879554748535, + "learning_rate": 3.935405487496204e-05, + "loss": 0.75, + "step": 11990 + }, + { + "epoch": 3.6397025345272422, + "grad_norm": 0.7142630219459534, + "learning_rate": 3.9348992609091835e-05, + "loss": 0.9634, + "step": 11991 + }, + { + "epoch": 3.640006070723934, + "grad_norm": 0.9535282254219055, + "learning_rate": 3.934393034322163e-05, + "loss": 1.1209, + "step": 11992 + }, + { + "epoch": 3.640309606920625, + "grad_norm": 1.1969962120056152, + "learning_rate": 3.9338868077351424e-05, + "loss": 0.7568, + "step": 11993 + }, + { + "epoch": 3.6406131431173168, + "grad_norm": 1.1002070903778076, + "learning_rate": 3.933380581148122e-05, + "loss": 0.7598, + "step": 11994 + }, + { + "epoch": 3.6409166793140084, + "grad_norm": 1.3393971920013428, + "learning_rate": 3.932874354561102e-05, + "loss": 1.0858, + "step": 11995 + }, + { + "epoch": 3.6412202155106996, + "grad_norm": 0.7879929542541504, + "learning_rate": 3.9323681279740816e-05, + "loss": 1.611, + "step": 11996 + }, + { + "epoch": 3.641523751707391, + "grad_norm": 1.2055152654647827, + "learning_rate": 3.931861901387061e-05, + "loss": 0.8166, + "step": 11997 + }, + { + "epoch": 3.6418272879040825, + "grad_norm": 0.8803278803825378, + "learning_rate": 3.9313556748000405e-05, + "loss": 1.5764, + "step": 11998 + }, + { + "epoch": 3.642130824100774, + "grad_norm": 0.8068442344665527, + "learning_rate": 3.930849448213021e-05, + "loss": 1.0555, + "step": 11999 + }, + { + "epoch": 3.6424343602974654, + "grad_norm": 1.0100542306900024, + "learning_rate": 3.930343221626e-05, + "loss": 0.6607, + "step": 12000 + }, + { + "epoch": 3.642737896494157, + "grad_norm": 1.178894281387329, + "learning_rate": 3.9298369950389796e-05, + "loss": 0.9682, + "step": 12001 + }, + { + "epoch": 3.6430414326908482, + "grad_norm": 1.0432075262069702, + "learning_rate": 3.929330768451959e-05, + "loss": 0.9449, + "step": 12002 + }, + { + "epoch": 3.64334496888754, + "grad_norm": 0.8466569185256958, + "learning_rate": 3.9288245418649386e-05, + "loss": 1.1329, + "step": 12003 + }, + { + "epoch": 3.643648505084231, + "grad_norm": 1.1064332723617554, + "learning_rate": 3.928318315277919e-05, + "loss": 1.0355, + "step": 12004 + }, + { + "epoch": 3.6439520412809228, + "grad_norm": 0.7345409989356995, + "learning_rate": 3.927812088690898e-05, + "loss": 1.4195, + "step": 12005 + }, + { + "epoch": 3.6442555774776144, + "grad_norm": 1.0027644634246826, + "learning_rate": 3.927305862103878e-05, + "loss": 0.9449, + "step": 12006 + }, + { + "epoch": 3.6445591136743056, + "grad_norm": 1.0467191934585571, + "learning_rate": 3.926799635516857e-05, + "loss": 0.5947, + "step": 12007 + }, + { + "epoch": 3.644862649870997, + "grad_norm": 0.8208259344100952, + "learning_rate": 3.9262934089298367e-05, + "loss": 1.122, + "step": 12008 + }, + { + "epoch": 3.6451661860676885, + "grad_norm": 1.0161082744598389, + "learning_rate": 3.925787182342817e-05, + "loss": 0.578, + "step": 12009 + }, + { + "epoch": 3.64546972226438, + "grad_norm": 0.9987685680389404, + "learning_rate": 3.925280955755796e-05, + "loss": 0.9358, + "step": 12010 + }, + { + "epoch": 3.6457732584610714, + "grad_norm": 0.7005483508110046, + "learning_rate": 3.924774729168776e-05, + "loss": 0.6287, + "step": 12011 + }, + { + "epoch": 3.646076794657763, + "grad_norm": 1.1996855735778809, + "learning_rate": 3.924268502581756e-05, + "loss": 0.9182, + "step": 12012 + }, + { + "epoch": 3.6463803308544542, + "grad_norm": 0.8942871689796448, + "learning_rate": 3.9237622759947354e-05, + "loss": 0.8962, + "step": 12013 + }, + { + "epoch": 3.646683867051146, + "grad_norm": 1.0719481706619263, + "learning_rate": 3.9232560494077156e-05, + "loss": 0.9931, + "step": 12014 + }, + { + "epoch": 3.646987403247837, + "grad_norm": 1.0616039037704468, + "learning_rate": 3.922749822820695e-05, + "loss": 1.0563, + "step": 12015 + }, + { + "epoch": 3.6472909394445288, + "grad_norm": 1.0456066131591797, + "learning_rate": 3.9222435962336745e-05, + "loss": 1.2178, + "step": 12016 + }, + { + "epoch": 3.6475944756412204, + "grad_norm": 0.9715830683708191, + "learning_rate": 3.921737369646654e-05, + "loss": 0.7868, + "step": 12017 + }, + { + "epoch": 3.6478980118379116, + "grad_norm": 1.2586824893951416, + "learning_rate": 3.9212311430596335e-05, + "loss": 1.0777, + "step": 12018 + }, + { + "epoch": 3.6482015480346033, + "grad_norm": 0.9855899214744568, + "learning_rate": 3.9207249164726136e-05, + "loss": 0.9687, + "step": 12019 + }, + { + "epoch": 3.6485050842312945, + "grad_norm": 0.894232988357544, + "learning_rate": 3.920218689885593e-05, + "loss": 1.3605, + "step": 12020 + }, + { + "epoch": 3.648808620427986, + "grad_norm": 1.3886747360229492, + "learning_rate": 3.9197124632985726e-05, + "loss": 0.9218, + "step": 12021 + }, + { + "epoch": 3.6491121566246774, + "grad_norm": 1.235087513923645, + "learning_rate": 3.919206236711552e-05, + "loss": 0.9951, + "step": 12022 + }, + { + "epoch": 3.649415692821369, + "grad_norm": 1.0058990716934204, + "learning_rate": 3.918700010124532e-05, + "loss": 0.9391, + "step": 12023 + }, + { + "epoch": 3.6497192290180607, + "grad_norm": 0.9259764552116394, + "learning_rate": 3.918193783537512e-05, + "loss": 0.3526, + "step": 12024 + }, + { + "epoch": 3.650022765214752, + "grad_norm": 0.9431399703025818, + "learning_rate": 3.917687556950491e-05, + "loss": 0.92, + "step": 12025 + }, + { + "epoch": 3.650326301411443, + "grad_norm": 0.9792678356170654, + "learning_rate": 3.917181330363471e-05, + "loss": 1.0912, + "step": 12026 + }, + { + "epoch": 3.6506298376081348, + "grad_norm": 1.249927043914795, + "learning_rate": 3.91667510377645e-05, + "loss": 0.7483, + "step": 12027 + }, + { + "epoch": 3.6509333738048264, + "grad_norm": 1.156718373298645, + "learning_rate": 3.91616887718943e-05, + "loss": 0.9721, + "step": 12028 + }, + { + "epoch": 3.6512369100015176, + "grad_norm": 0.9868149161338806, + "learning_rate": 3.91566265060241e-05, + "loss": 1.0434, + "step": 12029 + }, + { + "epoch": 3.6515404461982093, + "grad_norm": 1.315030813217163, + "learning_rate": 3.915156424015389e-05, + "loss": 1.0559, + "step": 12030 + }, + { + "epoch": 3.6518439823949005, + "grad_norm": 0.8619567155838013, + "learning_rate": 3.914650197428369e-05, + "loss": 1.0925, + "step": 12031 + }, + { + "epoch": 3.652147518591592, + "grad_norm": 0.7518938183784485, + "learning_rate": 3.914143970841348e-05, + "loss": 1.4304, + "step": 12032 + }, + { + "epoch": 3.6524510547882834, + "grad_norm": 0.7368621826171875, + "learning_rate": 3.9136377442543284e-05, + "loss": 1.0141, + "step": 12033 + }, + { + "epoch": 3.652754590984975, + "grad_norm": 0.6650719046592712, + "learning_rate": 3.9131315176673085e-05, + "loss": 1.1827, + "step": 12034 + }, + { + "epoch": 3.6530581271816667, + "grad_norm": 0.8382792472839355, + "learning_rate": 3.912625291080288e-05, + "loss": 0.4246, + "step": 12035 + }, + { + "epoch": 3.653361663378358, + "grad_norm": 1.1974725723266602, + "learning_rate": 3.9121190644932675e-05, + "loss": 0.7454, + "step": 12036 + }, + { + "epoch": 3.653665199575049, + "grad_norm": 0.8159098029136658, + "learning_rate": 3.911612837906247e-05, + "loss": 1.0952, + "step": 12037 + }, + { + "epoch": 3.6539687357717408, + "grad_norm": 1.1579129695892334, + "learning_rate": 3.911106611319227e-05, + "loss": 0.5428, + "step": 12038 + }, + { + "epoch": 3.6542722719684324, + "grad_norm": 0.8814316987991333, + "learning_rate": 3.9106003847322066e-05, + "loss": 0.7072, + "step": 12039 + }, + { + "epoch": 3.6545758081651236, + "grad_norm": 1.076669454574585, + "learning_rate": 3.910094158145186e-05, + "loss": 1.0992, + "step": 12040 + }, + { + "epoch": 3.6548793443618153, + "grad_norm": 1.192293405532837, + "learning_rate": 3.9095879315581656e-05, + "loss": 0.805, + "step": 12041 + }, + { + "epoch": 3.6551828805585065, + "grad_norm": 0.8559314608573914, + "learning_rate": 3.909081704971145e-05, + "loss": 0.8638, + "step": 12042 + }, + { + "epoch": 3.655486416755198, + "grad_norm": 1.040753722190857, + "learning_rate": 3.908575478384125e-05, + "loss": 1.2142, + "step": 12043 + }, + { + "epoch": 3.6557899529518894, + "grad_norm": 1.2287439107894897, + "learning_rate": 3.908069251797105e-05, + "loss": 0.8538, + "step": 12044 + }, + { + "epoch": 3.656093489148581, + "grad_norm": 1.078728199005127, + "learning_rate": 3.907563025210084e-05, + "loss": 1.0492, + "step": 12045 + }, + { + "epoch": 3.6563970253452727, + "grad_norm": 1.1457713842391968, + "learning_rate": 3.9070567986230637e-05, + "loss": 1.0057, + "step": 12046 + }, + { + "epoch": 3.656700561541964, + "grad_norm": 0.7097097635269165, + "learning_rate": 3.906550572036043e-05, + "loss": 1.1541, + "step": 12047 + }, + { + "epoch": 3.657004097738655, + "grad_norm": 1.0083330869674683, + "learning_rate": 3.906044345449023e-05, + "loss": 1.0981, + "step": 12048 + }, + { + "epoch": 3.6573076339353467, + "grad_norm": 1.0340123176574707, + "learning_rate": 3.905538118862003e-05, + "loss": 0.7882, + "step": 12049 + }, + { + "epoch": 3.6576111701320384, + "grad_norm": 1.0132097005844116, + "learning_rate": 3.905031892274982e-05, + "loss": 0.9619, + "step": 12050 + }, + { + "epoch": 3.6579147063287296, + "grad_norm": 1.2179789543151855, + "learning_rate": 3.904525665687962e-05, + "loss": 0.899, + "step": 12051 + }, + { + "epoch": 3.6582182425254213, + "grad_norm": 1.1193042993545532, + "learning_rate": 3.904019439100942e-05, + "loss": 0.9397, + "step": 12052 + }, + { + "epoch": 3.6585217787221125, + "grad_norm": 0.969189465045929, + "learning_rate": 3.9035132125139214e-05, + "loss": 0.5195, + "step": 12053 + }, + { + "epoch": 3.658825314918804, + "grad_norm": 1.2395955324172974, + "learning_rate": 3.903006985926901e-05, + "loss": 1.0449, + "step": 12054 + }, + { + "epoch": 3.6591288511154954, + "grad_norm": 0.6515886187553406, + "learning_rate": 3.90250075933988e-05, + "loss": 1.4677, + "step": 12055 + }, + { + "epoch": 3.659432387312187, + "grad_norm": 1.0559724569320679, + "learning_rate": 3.9019945327528605e-05, + "loss": 0.9408, + "step": 12056 + }, + { + "epoch": 3.6597359235088787, + "grad_norm": 1.2250043153762817, + "learning_rate": 3.90148830616584e-05, + "loss": 1.0425, + "step": 12057 + }, + { + "epoch": 3.66003945970557, + "grad_norm": 1.0291029214859009, + "learning_rate": 3.90098207957882e-05, + "loss": 0.9, + "step": 12058 + }, + { + "epoch": 3.660342995902261, + "grad_norm": 1.0426442623138428, + "learning_rate": 3.9004758529917996e-05, + "loss": 1.16, + "step": 12059 + }, + { + "epoch": 3.6606465320989527, + "grad_norm": 1.1924620866775513, + "learning_rate": 3.899969626404779e-05, + "loss": 0.9697, + "step": 12060 + }, + { + "epoch": 3.6609500682956444, + "grad_norm": 0.8227230906486511, + "learning_rate": 3.8994633998177586e-05, + "loss": 0.6492, + "step": 12061 + }, + { + "epoch": 3.6612536044923356, + "grad_norm": 0.9322243332862854, + "learning_rate": 3.898957173230739e-05, + "loss": 1.1933, + "step": 12062 + }, + { + "epoch": 3.6615571406890273, + "grad_norm": 1.0611060857772827, + "learning_rate": 3.898450946643718e-05, + "loss": 0.9958, + "step": 12063 + }, + { + "epoch": 3.6618606768857185, + "grad_norm": 1.138100028038025, + "learning_rate": 3.897944720056698e-05, + "loss": 0.856, + "step": 12064 + }, + { + "epoch": 3.66216421308241, + "grad_norm": 1.0799232721328735, + "learning_rate": 3.897438493469677e-05, + "loss": 1.008, + "step": 12065 + }, + { + "epoch": 3.6624677492791013, + "grad_norm": 1.0480406284332275, + "learning_rate": 3.8969322668826566e-05, + "loss": 0.9377, + "step": 12066 + }, + { + "epoch": 3.662771285475793, + "grad_norm": 0.9635022282600403, + "learning_rate": 3.896426040295637e-05, + "loss": 1.2153, + "step": 12067 + }, + { + "epoch": 3.6630748216724847, + "grad_norm": 0.9031151533126831, + "learning_rate": 3.895919813708616e-05, + "loss": 0.731, + "step": 12068 + }, + { + "epoch": 3.663378357869176, + "grad_norm": 1.044087529182434, + "learning_rate": 3.895413587121596e-05, + "loss": 1.1184, + "step": 12069 + }, + { + "epoch": 3.663681894065867, + "grad_norm": 0.8980289101600647, + "learning_rate": 3.894907360534575e-05, + "loss": 0.5401, + "step": 12070 + }, + { + "epoch": 3.6639854302625587, + "grad_norm": 0.9841526746749878, + "learning_rate": 3.894401133947555e-05, + "loss": 1.1086, + "step": 12071 + }, + { + "epoch": 3.6642889664592504, + "grad_norm": 1.15212881565094, + "learning_rate": 3.893894907360535e-05, + "loss": 0.6916, + "step": 12072 + }, + { + "epoch": 3.6645925026559416, + "grad_norm": 1.2138458490371704, + "learning_rate": 3.8933886807735143e-05, + "loss": 1.0718, + "step": 12073 + }, + { + "epoch": 3.6648960388526333, + "grad_norm": 0.8834080696105957, + "learning_rate": 3.892882454186494e-05, + "loss": 1.4294, + "step": 12074 + }, + { + "epoch": 3.6651995750493245, + "grad_norm": 0.8670762777328491, + "learning_rate": 3.892376227599473e-05, + "loss": 0.8987, + "step": 12075 + }, + { + "epoch": 3.665503111246016, + "grad_norm": 1.1858683824539185, + "learning_rate": 3.8918700010124535e-05, + "loss": 1.0599, + "step": 12076 + }, + { + "epoch": 3.6658066474427073, + "grad_norm": 1.234961986541748, + "learning_rate": 3.891363774425433e-05, + "loss": 0.8586, + "step": 12077 + }, + { + "epoch": 3.666110183639399, + "grad_norm": 1.1605101823806763, + "learning_rate": 3.8908575478384124e-05, + "loss": 0.8594, + "step": 12078 + }, + { + "epoch": 3.6664137198360907, + "grad_norm": 1.1436798572540283, + "learning_rate": 3.8903513212513926e-05, + "loss": 0.9058, + "step": 12079 + }, + { + "epoch": 3.666717256032782, + "grad_norm": 1.074713945388794, + "learning_rate": 3.889845094664372e-05, + "loss": 1.1255, + "step": 12080 + }, + { + "epoch": 3.6670207922294735, + "grad_norm": 1.0842182636260986, + "learning_rate": 3.8893388680773515e-05, + "loss": 0.8351, + "step": 12081 + }, + { + "epoch": 3.6673243284261647, + "grad_norm": 1.0366357564926147, + "learning_rate": 3.888832641490332e-05, + "loss": 0.7222, + "step": 12082 + }, + { + "epoch": 3.6676278646228564, + "grad_norm": 1.1431342363357544, + "learning_rate": 3.888326414903311e-05, + "loss": 0.591, + "step": 12083 + }, + { + "epoch": 3.6679314008195476, + "grad_norm": 0.8151815533638, + "learning_rate": 3.8878201883162907e-05, + "loss": 0.9629, + "step": 12084 + }, + { + "epoch": 3.6682349370162393, + "grad_norm": 1.0040082931518555, + "learning_rate": 3.88731396172927e-05, + "loss": 0.8697, + "step": 12085 + }, + { + "epoch": 3.668538473212931, + "grad_norm": 0.9840987324714661, + "learning_rate": 3.8868077351422496e-05, + "loss": 0.9602, + "step": 12086 + }, + { + "epoch": 3.668842009409622, + "grad_norm": 1.0550339221954346, + "learning_rate": 3.88630150855523e-05, + "loss": 1.0828, + "step": 12087 + }, + { + "epoch": 3.6691455456063133, + "grad_norm": 1.050746202468872, + "learning_rate": 3.885795281968209e-05, + "loss": 0.9869, + "step": 12088 + }, + { + "epoch": 3.669449081803005, + "grad_norm": 1.0241347551345825, + "learning_rate": 3.885289055381189e-05, + "loss": 0.6317, + "step": 12089 + }, + { + "epoch": 3.6697526179996967, + "grad_norm": 1.1782798767089844, + "learning_rate": 3.884782828794168e-05, + "loss": 0.919, + "step": 12090 + }, + { + "epoch": 3.670056154196388, + "grad_norm": 1.0204185247421265, + "learning_rate": 3.8842766022071484e-05, + "loss": 0.9816, + "step": 12091 + }, + { + "epoch": 3.6703596903930795, + "grad_norm": 0.9425390958786011, + "learning_rate": 3.883770375620128e-05, + "loss": 1.1783, + "step": 12092 + }, + { + "epoch": 3.6706632265897707, + "grad_norm": 1.1368494033813477, + "learning_rate": 3.883264149033107e-05, + "loss": 0.7561, + "step": 12093 + }, + { + "epoch": 3.6709667627864624, + "grad_norm": 0.8963260054588318, + "learning_rate": 3.882757922446087e-05, + "loss": 1.2093, + "step": 12094 + }, + { + "epoch": 3.6712702989831536, + "grad_norm": 1.1174767017364502, + "learning_rate": 3.882251695859066e-05, + "loss": 0.8852, + "step": 12095 + }, + { + "epoch": 3.6715738351798453, + "grad_norm": 1.040208101272583, + "learning_rate": 3.8817454692720464e-05, + "loss": 0.8368, + "step": 12096 + }, + { + "epoch": 3.671877371376537, + "grad_norm": 1.0897774696350098, + "learning_rate": 3.881239242685026e-05, + "loss": 1.2033, + "step": 12097 + }, + { + "epoch": 3.672180907573228, + "grad_norm": 0.9678447842597961, + "learning_rate": 3.8807330160980054e-05, + "loss": 1.3047, + "step": 12098 + }, + { + "epoch": 3.6724844437699193, + "grad_norm": 1.08724045753479, + "learning_rate": 3.880226789510985e-05, + "loss": 1.0252, + "step": 12099 + }, + { + "epoch": 3.672787979966611, + "grad_norm": 1.1432311534881592, + "learning_rate": 3.8797205629239644e-05, + "loss": 0.4971, + "step": 12100 + }, + { + "epoch": 3.6730915161633026, + "grad_norm": 1.072237491607666, + "learning_rate": 3.879214336336945e-05, + "loss": 0.9545, + "step": 12101 + }, + { + "epoch": 3.673395052359994, + "grad_norm": 1.096682071685791, + "learning_rate": 3.878708109749925e-05, + "loss": 0.8364, + "step": 12102 + }, + { + "epoch": 3.6736985885566855, + "grad_norm": 1.1902248859405518, + "learning_rate": 3.878201883162904e-05, + "loss": 0.9352, + "step": 12103 + }, + { + "epoch": 3.6740021247533767, + "grad_norm": 1.1655784845352173, + "learning_rate": 3.8776956565758836e-05, + "loss": 0.3943, + "step": 12104 + }, + { + "epoch": 3.6743056609500684, + "grad_norm": 1.2456717491149902, + "learning_rate": 3.877189429988863e-05, + "loss": 0.6998, + "step": 12105 + }, + { + "epoch": 3.6746091971467596, + "grad_norm": 1.1448798179626465, + "learning_rate": 3.876683203401843e-05, + "loss": 0.9647, + "step": 12106 + }, + { + "epoch": 3.6749127333434513, + "grad_norm": 0.9256434440612793, + "learning_rate": 3.876176976814823e-05, + "loss": 0.7016, + "step": 12107 + }, + { + "epoch": 3.675216269540143, + "grad_norm": 0.9627837538719177, + "learning_rate": 3.875670750227802e-05, + "loss": 1.3, + "step": 12108 + }, + { + "epoch": 3.675519805736834, + "grad_norm": 1.2234262228012085, + "learning_rate": 3.875164523640782e-05, + "loss": 1.0206, + "step": 12109 + }, + { + "epoch": 3.6758233419335253, + "grad_norm": 1.18487548828125, + "learning_rate": 3.874658297053761e-05, + "loss": 0.714, + "step": 12110 + }, + { + "epoch": 3.676126878130217, + "grad_norm": 0.8237455487251282, + "learning_rate": 3.8741520704667413e-05, + "loss": 0.7367, + "step": 12111 + }, + { + "epoch": 3.6764304143269086, + "grad_norm": 1.0138734579086304, + "learning_rate": 3.873645843879721e-05, + "loss": 1.2628, + "step": 12112 + }, + { + "epoch": 3.6767339505236, + "grad_norm": 1.3141272068023682, + "learning_rate": 3.8731396172927e-05, + "loss": 0.5804, + "step": 12113 + }, + { + "epoch": 3.6770374867202915, + "grad_norm": 1.2245581150054932, + "learning_rate": 3.87263339070568e-05, + "loss": 1.0306, + "step": 12114 + }, + { + "epoch": 3.6773410229169827, + "grad_norm": 1.1122380495071411, + "learning_rate": 3.87212716411866e-05, + "loss": 1.014, + "step": 12115 + }, + { + "epoch": 3.6776445591136744, + "grad_norm": 1.285766839981079, + "learning_rate": 3.8716209375316394e-05, + "loss": 0.8452, + "step": 12116 + }, + { + "epoch": 3.6779480953103656, + "grad_norm": 0.9923285841941833, + "learning_rate": 3.871114710944619e-05, + "loss": 1.0973, + "step": 12117 + }, + { + "epoch": 3.6782516315070573, + "grad_norm": 0.8689481616020203, + "learning_rate": 3.8706084843575984e-05, + "loss": 1.2463, + "step": 12118 + }, + { + "epoch": 3.678555167703749, + "grad_norm": 1.0687531232833862, + "learning_rate": 3.870102257770578e-05, + "loss": 0.9177, + "step": 12119 + }, + { + "epoch": 3.67885870390044, + "grad_norm": 1.0274349451065063, + "learning_rate": 3.869596031183558e-05, + "loss": 1.0718, + "step": 12120 + }, + { + "epoch": 3.6791622400971313, + "grad_norm": 0.8803656697273254, + "learning_rate": 3.8690898045965375e-05, + "loss": 0.8178, + "step": 12121 + }, + { + "epoch": 3.679465776293823, + "grad_norm": 1.0155199766159058, + "learning_rate": 3.868583578009517e-05, + "loss": 0.9875, + "step": 12122 + }, + { + "epoch": 3.6797693124905146, + "grad_norm": 1.00933039188385, + "learning_rate": 3.868077351422497e-05, + "loss": 1.0408, + "step": 12123 + }, + { + "epoch": 3.680072848687206, + "grad_norm": 0.9493554830551147, + "learning_rate": 3.8675711248354766e-05, + "loss": 0.9832, + "step": 12124 + }, + { + "epoch": 3.6803763848838975, + "grad_norm": 0.9082612991333008, + "learning_rate": 3.867064898248456e-05, + "loss": 1.2616, + "step": 12125 + }, + { + "epoch": 3.6806799210805887, + "grad_norm": 0.9704240560531616, + "learning_rate": 3.866558671661436e-05, + "loss": 1.165, + "step": 12126 + }, + { + "epoch": 3.6809834572772804, + "grad_norm": 1.0881056785583496, + "learning_rate": 3.866052445074416e-05, + "loss": 1.0524, + "step": 12127 + }, + { + "epoch": 3.6812869934739716, + "grad_norm": 1.1638364791870117, + "learning_rate": 3.865546218487395e-05, + "loss": 0.5493, + "step": 12128 + }, + { + "epoch": 3.6815905296706632, + "grad_norm": 0.9481220245361328, + "learning_rate": 3.865039991900375e-05, + "loss": 0.6109, + "step": 12129 + }, + { + "epoch": 3.681894065867355, + "grad_norm": 1.0689703226089478, + "learning_rate": 3.864533765313355e-05, + "loss": 0.9964, + "step": 12130 + }, + { + "epoch": 3.682197602064046, + "grad_norm": 0.9507383704185486, + "learning_rate": 3.864027538726334e-05, + "loss": 0.6419, + "step": 12131 + }, + { + "epoch": 3.6825011382607373, + "grad_norm": 1.1811165809631348, + "learning_rate": 3.863521312139314e-05, + "loss": 0.7666, + "step": 12132 + }, + { + "epoch": 3.682804674457429, + "grad_norm": 1.3146898746490479, + "learning_rate": 3.863015085552293e-05, + "loss": 0.8806, + "step": 12133 + }, + { + "epoch": 3.6831082106541206, + "grad_norm": 0.831794798374176, + "learning_rate": 3.862508858965273e-05, + "loss": 0.9983, + "step": 12134 + }, + { + "epoch": 3.683411746850812, + "grad_norm": 1.1141377687454224, + "learning_rate": 3.862002632378253e-05, + "loss": 1.0834, + "step": 12135 + }, + { + "epoch": 3.6837152830475035, + "grad_norm": 1.2772284746170044, + "learning_rate": 3.8614964057912324e-05, + "loss": 0.7985, + "step": 12136 + }, + { + "epoch": 3.6840188192441947, + "grad_norm": 0.9757315516471863, + "learning_rate": 3.860990179204212e-05, + "loss": 1.2705, + "step": 12137 + }, + { + "epoch": 3.6843223554408864, + "grad_norm": 0.7473105788230896, + "learning_rate": 3.8604839526171913e-05, + "loss": 1.5949, + "step": 12138 + }, + { + "epoch": 3.6846258916375776, + "grad_norm": 0.8062529563903809, + "learning_rate": 3.859977726030171e-05, + "loss": 0.8017, + "step": 12139 + }, + { + "epoch": 3.6849294278342692, + "grad_norm": 1.0470958948135376, + "learning_rate": 3.859471499443151e-05, + "loss": 0.4568, + "step": 12140 + }, + { + "epoch": 3.685232964030961, + "grad_norm": 0.8670344352722168, + "learning_rate": 3.8589652728561305e-05, + "loss": 1.0303, + "step": 12141 + }, + { + "epoch": 3.685536500227652, + "grad_norm": 0.857889711856842, + "learning_rate": 3.85845904626911e-05, + "loss": 0.9689, + "step": 12142 + }, + { + "epoch": 3.6858400364243438, + "grad_norm": 0.9390396475791931, + "learning_rate": 3.8579528196820894e-05, + "loss": 1.1482, + "step": 12143 + }, + { + "epoch": 3.686143572621035, + "grad_norm": 1.2261883020401, + "learning_rate": 3.8574465930950696e-05, + "loss": 1.2344, + "step": 12144 + }, + { + "epoch": 3.6864471088177266, + "grad_norm": 1.0962419509887695, + "learning_rate": 3.85694036650805e-05, + "loss": 0.9081, + "step": 12145 + }, + { + "epoch": 3.686750645014418, + "grad_norm": 0.6134799122810364, + "learning_rate": 3.856434139921029e-05, + "loss": 0.6927, + "step": 12146 + }, + { + "epoch": 3.6870541812111095, + "grad_norm": 0.982342541217804, + "learning_rate": 3.855927913334009e-05, + "loss": 0.7295, + "step": 12147 + }, + { + "epoch": 3.6873577174078007, + "grad_norm": 1.2685867547988892, + "learning_rate": 3.855421686746988e-05, + "loss": 0.9338, + "step": 12148 + }, + { + "epoch": 3.6876612536044924, + "grad_norm": 1.2016700506210327, + "learning_rate": 3.8549154601599677e-05, + "loss": 1.0333, + "step": 12149 + }, + { + "epoch": 3.6879647898011836, + "grad_norm": 0.9865316152572632, + "learning_rate": 3.854409233572948e-05, + "loss": 1.1375, + "step": 12150 + }, + { + "epoch": 3.6882683259978752, + "grad_norm": 1.0826231241226196, + "learning_rate": 3.853903006985927e-05, + "loss": 1.0688, + "step": 12151 + }, + { + "epoch": 3.688571862194567, + "grad_norm": 0.8892161846160889, + "learning_rate": 3.853396780398907e-05, + "loss": 1.2882, + "step": 12152 + }, + { + "epoch": 3.688875398391258, + "grad_norm": 1.1065289974212646, + "learning_rate": 3.852890553811886e-05, + "loss": 0.8383, + "step": 12153 + }, + { + "epoch": 3.6891789345879498, + "grad_norm": 1.0190792083740234, + "learning_rate": 3.8523843272248664e-05, + "loss": 0.7031, + "step": 12154 + }, + { + "epoch": 3.689482470784641, + "grad_norm": 1.1541862487792969, + "learning_rate": 3.851878100637846e-05, + "loss": 1.1368, + "step": 12155 + }, + { + "epoch": 3.6897860069813326, + "grad_norm": 0.8970853686332703, + "learning_rate": 3.8513718740508254e-05, + "loss": 1.2694, + "step": 12156 + }, + { + "epoch": 3.690089543178024, + "grad_norm": 0.739221453666687, + "learning_rate": 3.850865647463805e-05, + "loss": 0.9557, + "step": 12157 + }, + { + "epoch": 3.6903930793747155, + "grad_norm": 1.1054959297180176, + "learning_rate": 3.850359420876784e-05, + "loss": 0.8193, + "step": 12158 + }, + { + "epoch": 3.690696615571407, + "grad_norm": 0.9683334827423096, + "learning_rate": 3.8498531942897645e-05, + "loss": 1.3044, + "step": 12159 + }, + { + "epoch": 3.6910001517680984, + "grad_norm": 0.8212486505508423, + "learning_rate": 3.849346967702744e-05, + "loss": 0.6349, + "step": 12160 + }, + { + "epoch": 3.6913036879647896, + "grad_norm": 0.9672641158103943, + "learning_rate": 3.8488407411157234e-05, + "loss": 0.6538, + "step": 12161 + }, + { + "epoch": 3.6916072241614812, + "grad_norm": 0.981306254863739, + "learning_rate": 3.848334514528703e-05, + "loss": 1.0245, + "step": 12162 + }, + { + "epoch": 3.691910760358173, + "grad_norm": 1.3195877075195312, + "learning_rate": 3.8478282879416824e-05, + "loss": 1.306, + "step": 12163 + }, + { + "epoch": 3.692214296554864, + "grad_norm": 0.9816535115242004, + "learning_rate": 3.8473220613546626e-05, + "loss": 0.725, + "step": 12164 + }, + { + "epoch": 3.6925178327515558, + "grad_norm": 1.2062028646469116, + "learning_rate": 3.846815834767642e-05, + "loss": 0.8131, + "step": 12165 + }, + { + "epoch": 3.692821368948247, + "grad_norm": 1.2611056566238403, + "learning_rate": 3.8463096081806215e-05, + "loss": 0.57, + "step": 12166 + }, + { + "epoch": 3.6931249051449386, + "grad_norm": 0.9569219946861267, + "learning_rate": 3.845803381593601e-05, + "loss": 1.3357, + "step": 12167 + }, + { + "epoch": 3.69342844134163, + "grad_norm": 1.1622668504714966, + "learning_rate": 3.845297155006581e-05, + "loss": 0.8097, + "step": 12168 + }, + { + "epoch": 3.6937319775383215, + "grad_norm": 0.8993034958839417, + "learning_rate": 3.844790928419561e-05, + "loss": 1.3427, + "step": 12169 + }, + { + "epoch": 3.694035513735013, + "grad_norm": 0.5901782512664795, + "learning_rate": 3.844284701832541e-05, + "loss": 0.8111, + "step": 12170 + }, + { + "epoch": 3.6943390499317044, + "grad_norm": 1.0208097696304321, + "learning_rate": 3.84377847524552e-05, + "loss": 1.0796, + "step": 12171 + }, + { + "epoch": 3.6946425861283956, + "grad_norm": 1.1238336563110352, + "learning_rate": 3.8432722486585e-05, + "loss": 0.6026, + "step": 12172 + }, + { + "epoch": 3.6949461223250872, + "grad_norm": 0.7845574021339417, + "learning_rate": 3.842766022071479e-05, + "loss": 0.7885, + "step": 12173 + }, + { + "epoch": 3.695249658521779, + "grad_norm": 0.987266480922699, + "learning_rate": 3.8422597954844594e-05, + "loss": 1.3136, + "step": 12174 + }, + { + "epoch": 3.69555319471847, + "grad_norm": 0.9419277310371399, + "learning_rate": 3.841753568897439e-05, + "loss": 0.7413, + "step": 12175 + }, + { + "epoch": 3.6958567309151618, + "grad_norm": 1.0948914289474487, + "learning_rate": 3.8412473423104183e-05, + "loss": 0.7056, + "step": 12176 + }, + { + "epoch": 3.696160267111853, + "grad_norm": 1.013705849647522, + "learning_rate": 3.840741115723398e-05, + "loss": 1.0751, + "step": 12177 + }, + { + "epoch": 3.6964638033085446, + "grad_norm": 1.0518913269042969, + "learning_rate": 3.840234889136377e-05, + "loss": 0.7971, + "step": 12178 + }, + { + "epoch": 3.696767339505236, + "grad_norm": 0.9736500382423401, + "learning_rate": 3.8397286625493575e-05, + "loss": 1.2676, + "step": 12179 + }, + { + "epoch": 3.6970708757019275, + "grad_norm": 0.9486212134361267, + "learning_rate": 3.839222435962337e-05, + "loss": 1.0547, + "step": 12180 + }, + { + "epoch": 3.697374411898619, + "grad_norm": 0.7350013256072998, + "learning_rate": 3.8387162093753164e-05, + "loss": 0.842, + "step": 12181 + }, + { + "epoch": 3.6976779480953104, + "grad_norm": 0.9961176514625549, + "learning_rate": 3.838209982788296e-05, + "loss": 0.8944, + "step": 12182 + }, + { + "epoch": 3.6979814842920016, + "grad_norm": 0.9222147464752197, + "learning_rate": 3.837703756201276e-05, + "loss": 1.2384, + "step": 12183 + }, + { + "epoch": 3.6982850204886932, + "grad_norm": 1.0689314603805542, + "learning_rate": 3.8371975296142555e-05, + "loss": 0.9423, + "step": 12184 + }, + { + "epoch": 3.698588556685385, + "grad_norm": 0.8910040855407715, + "learning_rate": 3.836691303027235e-05, + "loss": 1.2652, + "step": 12185 + }, + { + "epoch": 3.698892092882076, + "grad_norm": 1.203522801399231, + "learning_rate": 3.8361850764402145e-05, + "loss": 0.426, + "step": 12186 + }, + { + "epoch": 3.6991956290787678, + "grad_norm": 1.141925573348999, + "learning_rate": 3.835678849853194e-05, + "loss": 1.0067, + "step": 12187 + }, + { + "epoch": 3.699499165275459, + "grad_norm": 1.1666052341461182, + "learning_rate": 3.835172623266174e-05, + "loss": 1.1564, + "step": 12188 + }, + { + "epoch": 3.6998027014721506, + "grad_norm": 1.308785319328308, + "learning_rate": 3.8346663966791536e-05, + "loss": 1.2319, + "step": 12189 + }, + { + "epoch": 3.700106237668842, + "grad_norm": 0.9423047304153442, + "learning_rate": 3.834160170092134e-05, + "loss": 1.546, + "step": 12190 + }, + { + "epoch": 3.7004097738655335, + "grad_norm": 1.1595866680145264, + "learning_rate": 3.833653943505113e-05, + "loss": 1.026, + "step": 12191 + }, + { + "epoch": 3.700713310062225, + "grad_norm": 0.9828789830207825, + "learning_rate": 3.833147716918093e-05, + "loss": 0.643, + "step": 12192 + }, + { + "epoch": 3.7010168462589164, + "grad_norm": 0.9516759514808655, + "learning_rate": 3.832641490331073e-05, + "loss": 0.7391, + "step": 12193 + }, + { + "epoch": 3.7013203824556076, + "grad_norm": 0.9273096919059753, + "learning_rate": 3.8321352637440524e-05, + "loss": 1.0703, + "step": 12194 + }, + { + "epoch": 3.7016239186522992, + "grad_norm": 1.160027027130127, + "learning_rate": 3.831629037157032e-05, + "loss": 1.0675, + "step": 12195 + }, + { + "epoch": 3.701927454848991, + "grad_norm": 0.9412860870361328, + "learning_rate": 3.831122810570011e-05, + "loss": 0.5968, + "step": 12196 + }, + { + "epoch": 3.702230991045682, + "grad_norm": 1.207733154296875, + "learning_rate": 3.830616583982991e-05, + "loss": 1.0296, + "step": 12197 + }, + { + "epoch": 3.7025345272423738, + "grad_norm": 0.8901572823524475, + "learning_rate": 3.830110357395971e-05, + "loss": 1.1567, + "step": 12198 + }, + { + "epoch": 3.702838063439065, + "grad_norm": 0.9975547194480896, + "learning_rate": 3.8296041308089504e-05, + "loss": 0.6348, + "step": 12199 + }, + { + "epoch": 3.7031415996357566, + "grad_norm": 1.125101923942566, + "learning_rate": 3.82909790422193e-05, + "loss": 1.0591, + "step": 12200 + }, + { + "epoch": 3.703445135832448, + "grad_norm": 1.107262134552002, + "learning_rate": 3.8285916776349094e-05, + "loss": 1.2022, + "step": 12201 + }, + { + "epoch": 3.7037486720291395, + "grad_norm": 1.0285307168960571, + "learning_rate": 3.828085451047889e-05, + "loss": 1.0454, + "step": 12202 + }, + { + "epoch": 3.704052208225831, + "grad_norm": 0.971775472164154, + "learning_rate": 3.827579224460869e-05, + "loss": 1.3207, + "step": 12203 + }, + { + "epoch": 3.7043557444225224, + "grad_norm": 0.9013938903808594, + "learning_rate": 3.8270729978738485e-05, + "loss": 1.1316, + "step": 12204 + }, + { + "epoch": 3.7046592806192136, + "grad_norm": 1.1737385988235474, + "learning_rate": 3.826566771286828e-05, + "loss": 0.933, + "step": 12205 + }, + { + "epoch": 3.7049628168159052, + "grad_norm": 1.0344955921173096, + "learning_rate": 3.8260605446998075e-05, + "loss": 1.1893, + "step": 12206 + }, + { + "epoch": 3.705266353012597, + "grad_norm": 1.047410011291504, + "learning_rate": 3.8255543181127876e-05, + "loss": 1.2353, + "step": 12207 + }, + { + "epoch": 3.705569889209288, + "grad_norm": 0.966019332408905, + "learning_rate": 3.825048091525767e-05, + "loss": 1.4408, + "step": 12208 + }, + { + "epoch": 3.7058734254059797, + "grad_norm": 1.0156018733978271, + "learning_rate": 3.8245418649387466e-05, + "loss": 1.2875, + "step": 12209 + }, + { + "epoch": 3.706176961602671, + "grad_norm": 1.043990135192871, + "learning_rate": 3.824035638351726e-05, + "loss": 0.7414, + "step": 12210 + }, + { + "epoch": 3.7064804977993626, + "grad_norm": 0.7362629771232605, + "learning_rate": 3.8235294117647055e-05, + "loss": 1.5277, + "step": 12211 + }, + { + "epoch": 3.706784033996054, + "grad_norm": 1.0909278392791748, + "learning_rate": 3.823023185177686e-05, + "loss": 0.9033, + "step": 12212 + }, + { + "epoch": 3.7070875701927455, + "grad_norm": 0.905019223690033, + "learning_rate": 3.822516958590666e-05, + "loss": 1.3159, + "step": 12213 + }, + { + "epoch": 3.707391106389437, + "grad_norm": 1.1853880882263184, + "learning_rate": 3.8220107320036453e-05, + "loss": 1.0329, + "step": 12214 + }, + { + "epoch": 3.7076946425861284, + "grad_norm": 0.8706358671188354, + "learning_rate": 3.821504505416625e-05, + "loss": 1.1769, + "step": 12215 + }, + { + "epoch": 3.70799817878282, + "grad_norm": 0.9755493998527527, + "learning_rate": 3.820998278829604e-05, + "loss": 0.814, + "step": 12216 + }, + { + "epoch": 3.708301714979511, + "grad_norm": 1.234582781791687, + "learning_rate": 3.820492052242584e-05, + "loss": 0.8302, + "step": 12217 + }, + { + "epoch": 3.708605251176203, + "grad_norm": 1.1053121089935303, + "learning_rate": 3.819985825655564e-05, + "loss": 0.9119, + "step": 12218 + }, + { + "epoch": 3.708908787372894, + "grad_norm": 0.895339846611023, + "learning_rate": 3.8194795990685434e-05, + "loss": 0.8842, + "step": 12219 + }, + { + "epoch": 3.7092123235695857, + "grad_norm": 1.3201688528060913, + "learning_rate": 3.818973372481523e-05, + "loss": 0.8321, + "step": 12220 + }, + { + "epoch": 3.7095158597662774, + "grad_norm": 0.8213345408439636, + "learning_rate": 3.8184671458945024e-05, + "loss": 1.0081, + "step": 12221 + }, + { + "epoch": 3.7098193959629686, + "grad_norm": 0.7760623097419739, + "learning_rate": 3.8179609193074825e-05, + "loss": 0.87, + "step": 12222 + }, + { + "epoch": 3.71012293215966, + "grad_norm": 0.807321310043335, + "learning_rate": 3.817454692720462e-05, + "loss": 0.8945, + "step": 12223 + }, + { + "epoch": 3.7104264683563515, + "grad_norm": 1.0672060251235962, + "learning_rate": 3.8169484661334415e-05, + "loss": 0.9456, + "step": 12224 + }, + { + "epoch": 3.710730004553043, + "grad_norm": 1.17915940284729, + "learning_rate": 3.816442239546421e-05, + "loss": 0.955, + "step": 12225 + }, + { + "epoch": 3.7110335407497343, + "grad_norm": 0.842879593372345, + "learning_rate": 3.8159360129594004e-05, + "loss": 1.2119, + "step": 12226 + }, + { + "epoch": 3.711337076946426, + "grad_norm": 1.1195244789123535, + "learning_rate": 3.8154297863723806e-05, + "loss": 1.2887, + "step": 12227 + }, + { + "epoch": 3.711640613143117, + "grad_norm": 1.2647395133972168, + "learning_rate": 3.81492355978536e-05, + "loss": 1.0693, + "step": 12228 + }, + { + "epoch": 3.711944149339809, + "grad_norm": 0.9714828133583069, + "learning_rate": 3.8144173331983396e-05, + "loss": 1.2092, + "step": 12229 + }, + { + "epoch": 3.7122476855365, + "grad_norm": 1.062777042388916, + "learning_rate": 3.813911106611319e-05, + "loss": 1.2593, + "step": 12230 + }, + { + "epoch": 3.7125512217331917, + "grad_norm": 1.1548885107040405, + "learning_rate": 3.8134048800242985e-05, + "loss": 1.005, + "step": 12231 + }, + { + "epoch": 3.7128547579298834, + "grad_norm": 1.0412431955337524, + "learning_rate": 3.812898653437279e-05, + "loss": 1.2472, + "step": 12232 + }, + { + "epoch": 3.7131582941265746, + "grad_norm": 0.9406505227088928, + "learning_rate": 3.812392426850258e-05, + "loss": 0.9713, + "step": 12233 + }, + { + "epoch": 3.713461830323266, + "grad_norm": 0.9440913796424866, + "learning_rate": 3.811886200263238e-05, + "loss": 1.1702, + "step": 12234 + }, + { + "epoch": 3.7137653665199575, + "grad_norm": 1.0048294067382812, + "learning_rate": 3.811379973676218e-05, + "loss": 1.1419, + "step": 12235 + }, + { + "epoch": 3.714068902716649, + "grad_norm": 1.0879192352294922, + "learning_rate": 3.810873747089197e-05, + "loss": 1.0218, + "step": 12236 + }, + { + "epoch": 3.7143724389133403, + "grad_norm": 0.9541288614273071, + "learning_rate": 3.8103675205021774e-05, + "loss": 0.9656, + "step": 12237 + }, + { + "epoch": 3.714675975110032, + "grad_norm": 1.0213394165039062, + "learning_rate": 3.809861293915157e-05, + "loss": 1.1674, + "step": 12238 + }, + { + "epoch": 3.714979511306723, + "grad_norm": 0.9564364552497864, + "learning_rate": 3.8093550673281364e-05, + "loss": 0.5582, + "step": 12239 + }, + { + "epoch": 3.715283047503415, + "grad_norm": 1.0670713186264038, + "learning_rate": 3.808848840741116e-05, + "loss": 1.1753, + "step": 12240 + }, + { + "epoch": 3.715586583700106, + "grad_norm": 0.9610856771469116, + "learning_rate": 3.8083426141540954e-05, + "loss": 0.9492, + "step": 12241 + }, + { + "epoch": 3.7158901198967977, + "grad_norm": 0.7419840693473816, + "learning_rate": 3.8078363875670755e-05, + "loss": 0.888, + "step": 12242 + }, + { + "epoch": 3.7161936560934894, + "grad_norm": 1.3475884199142456, + "learning_rate": 3.807330160980055e-05, + "loss": 0.7909, + "step": 12243 + }, + { + "epoch": 3.7164971922901806, + "grad_norm": 0.9927060604095459, + "learning_rate": 3.8068239343930345e-05, + "loss": 1.2154, + "step": 12244 + }, + { + "epoch": 3.716800728486872, + "grad_norm": 1.2149968147277832, + "learning_rate": 3.806317707806014e-05, + "loss": 0.8462, + "step": 12245 + }, + { + "epoch": 3.7171042646835635, + "grad_norm": 0.960020899772644, + "learning_rate": 3.805811481218994e-05, + "loss": 0.5283, + "step": 12246 + }, + { + "epoch": 3.717407800880255, + "grad_norm": 1.319292426109314, + "learning_rate": 3.8053052546319736e-05, + "loss": 0.7028, + "step": 12247 + }, + { + "epoch": 3.7177113370769463, + "grad_norm": 1.1760718822479248, + "learning_rate": 3.804799028044953e-05, + "loss": 0.6196, + "step": 12248 + }, + { + "epoch": 3.718014873273638, + "grad_norm": 1.332132339477539, + "learning_rate": 3.8042928014579325e-05, + "loss": 0.9295, + "step": 12249 + }, + { + "epoch": 3.718318409470329, + "grad_norm": 1.3408931493759155, + "learning_rate": 3.803786574870912e-05, + "loss": 0.8238, + "step": 12250 + }, + { + "epoch": 3.718621945667021, + "grad_norm": 1.3683499097824097, + "learning_rate": 3.803280348283892e-05, + "loss": 0.8292, + "step": 12251 + }, + { + "epoch": 3.718925481863712, + "grad_norm": 1.0252432823181152, + "learning_rate": 3.8027741216968717e-05, + "loss": 0.8108, + "step": 12252 + }, + { + "epoch": 3.7192290180604037, + "grad_norm": 1.0486154556274414, + "learning_rate": 3.802267895109851e-05, + "loss": 0.9567, + "step": 12253 + }, + { + "epoch": 3.7195325542570954, + "grad_norm": 1.3113080263137817, + "learning_rate": 3.8017616685228306e-05, + "loss": 0.895, + "step": 12254 + }, + { + "epoch": 3.7198360904537866, + "grad_norm": 1.1616406440734863, + "learning_rate": 3.80125544193581e-05, + "loss": 0.9117, + "step": 12255 + }, + { + "epoch": 3.720139626650478, + "grad_norm": 1.069471836090088, + "learning_rate": 3.80074921534879e-05, + "loss": 1.1821, + "step": 12256 + }, + { + "epoch": 3.7204431628471695, + "grad_norm": 1.070489525794983, + "learning_rate": 3.8002429887617704e-05, + "loss": 1.2504, + "step": 12257 + }, + { + "epoch": 3.720746699043861, + "grad_norm": 0.8889525532722473, + "learning_rate": 3.79973676217475e-05, + "loss": 1.0755, + "step": 12258 + }, + { + "epoch": 3.7210502352405523, + "grad_norm": 1.0680694580078125, + "learning_rate": 3.7992305355877294e-05, + "loss": 0.8431, + "step": 12259 + }, + { + "epoch": 3.721353771437244, + "grad_norm": 1.1748120784759521, + "learning_rate": 3.798724309000709e-05, + "loss": 0.9165, + "step": 12260 + }, + { + "epoch": 3.721657307633935, + "grad_norm": 1.1314773559570312, + "learning_rate": 3.798218082413689e-05, + "loss": 0.9413, + "step": 12261 + }, + { + "epoch": 3.721960843830627, + "grad_norm": 1.0644497871398926, + "learning_rate": 3.7977118558266685e-05, + "loss": 1.0561, + "step": 12262 + }, + { + "epoch": 3.722264380027318, + "grad_norm": 1.1562353372573853, + "learning_rate": 3.797205629239648e-05, + "loss": 0.978, + "step": 12263 + }, + { + "epoch": 3.7225679162240097, + "grad_norm": 1.1052519083023071, + "learning_rate": 3.7966994026526274e-05, + "loss": 0.8918, + "step": 12264 + }, + { + "epoch": 3.7228714524207014, + "grad_norm": 1.2539854049682617, + "learning_rate": 3.796193176065607e-05, + "loss": 0.801, + "step": 12265 + }, + { + "epoch": 3.7231749886173926, + "grad_norm": 1.009560465812683, + "learning_rate": 3.795686949478587e-05, + "loss": 1.0034, + "step": 12266 + }, + { + "epoch": 3.723478524814084, + "grad_norm": 1.2761421203613281, + "learning_rate": 3.7951807228915666e-05, + "loss": 0.8267, + "step": 12267 + }, + { + "epoch": 3.7237820610107755, + "grad_norm": 1.091572642326355, + "learning_rate": 3.794674496304546e-05, + "loss": 0.9635, + "step": 12268 + }, + { + "epoch": 3.724085597207467, + "grad_norm": 1.1496751308441162, + "learning_rate": 3.7941682697175255e-05, + "loss": 0.8326, + "step": 12269 + }, + { + "epoch": 3.7243891334041583, + "grad_norm": 1.069031000137329, + "learning_rate": 3.793662043130505e-05, + "loss": 0.9477, + "step": 12270 + }, + { + "epoch": 3.72469266960085, + "grad_norm": 0.9979076981544495, + "learning_rate": 3.793155816543485e-05, + "loss": 1.198, + "step": 12271 + }, + { + "epoch": 3.724996205797541, + "grad_norm": 1.1072041988372803, + "learning_rate": 3.7926495899564646e-05, + "loss": 1.1679, + "step": 12272 + }, + { + "epoch": 3.725299741994233, + "grad_norm": 0.7915745973587036, + "learning_rate": 3.792143363369444e-05, + "loss": 0.7252, + "step": 12273 + }, + { + "epoch": 3.725603278190924, + "grad_norm": 0.9336658120155334, + "learning_rate": 3.7916371367824236e-05, + "loss": 1.1329, + "step": 12274 + }, + { + "epoch": 3.7259068143876157, + "grad_norm": 0.9334734678268433, + "learning_rate": 3.791130910195404e-05, + "loss": 0.915, + "step": 12275 + }, + { + "epoch": 3.7262103505843074, + "grad_norm": 1.0774483680725098, + "learning_rate": 3.790624683608383e-05, + "loss": 1.1978, + "step": 12276 + }, + { + "epoch": 3.7265138867809986, + "grad_norm": 0.9118005037307739, + "learning_rate": 3.790118457021363e-05, + "loss": 1.388, + "step": 12277 + }, + { + "epoch": 3.7268174229776903, + "grad_norm": 1.2302980422973633, + "learning_rate": 3.789612230434342e-05, + "loss": 0.8122, + "step": 12278 + }, + { + "epoch": 3.7271209591743815, + "grad_norm": 1.3205320835113525, + "learning_rate": 3.7891060038473223e-05, + "loss": 0.4794, + "step": 12279 + }, + { + "epoch": 3.727424495371073, + "grad_norm": 1.1351262331008911, + "learning_rate": 3.788599777260302e-05, + "loss": 0.8818, + "step": 12280 + }, + { + "epoch": 3.7277280315677643, + "grad_norm": 1.0000221729278564, + "learning_rate": 3.788093550673282e-05, + "loss": 1.1966, + "step": 12281 + }, + { + "epoch": 3.728031567764456, + "grad_norm": 1.1909509897232056, + "learning_rate": 3.7875873240862615e-05, + "loss": 0.887, + "step": 12282 + }, + { + "epoch": 3.728335103961147, + "grad_norm": 1.0884264707565308, + "learning_rate": 3.787081097499241e-05, + "loss": 0.977, + "step": 12283 + }, + { + "epoch": 3.728638640157839, + "grad_norm": 1.5935136079788208, + "learning_rate": 3.7865748709122204e-05, + "loss": 0.8703, + "step": 12284 + }, + { + "epoch": 3.72894217635453, + "grad_norm": 1.012306571006775, + "learning_rate": 3.7860686443252006e-05, + "loss": 0.5538, + "step": 12285 + }, + { + "epoch": 3.7292457125512217, + "grad_norm": 0.7689440846443176, + "learning_rate": 3.78556241773818e-05, + "loss": 1.3235, + "step": 12286 + }, + { + "epoch": 3.7295492487479134, + "grad_norm": 0.9987410306930542, + "learning_rate": 3.7850561911511595e-05, + "loss": 0.7981, + "step": 12287 + }, + { + "epoch": 3.7298527849446046, + "grad_norm": 0.7312904000282288, + "learning_rate": 3.784549964564139e-05, + "loss": 0.6987, + "step": 12288 + }, + { + "epoch": 3.7301563211412962, + "grad_norm": 0.8435193300247192, + "learning_rate": 3.7840437379771185e-05, + "loss": 1.0323, + "step": 12289 + }, + { + "epoch": 3.7304598573379875, + "grad_norm": 1.1054636240005493, + "learning_rate": 3.7835375113900987e-05, + "loss": 0.452, + "step": 12290 + }, + { + "epoch": 3.730763393534679, + "grad_norm": 1.0118950605392456, + "learning_rate": 3.783031284803078e-05, + "loss": 0.8221, + "step": 12291 + }, + { + "epoch": 3.7310669297313703, + "grad_norm": 1.2656505107879639, + "learning_rate": 3.7825250582160576e-05, + "loss": 0.9625, + "step": 12292 + }, + { + "epoch": 3.731370465928062, + "grad_norm": 1.0235183238983154, + "learning_rate": 3.782018831629037e-05, + "loss": 1.0347, + "step": 12293 + }, + { + "epoch": 3.7316740021247536, + "grad_norm": 1.0891318321228027, + "learning_rate": 3.7815126050420166e-05, + "loss": 0.964, + "step": 12294 + }, + { + "epoch": 3.731977538321445, + "grad_norm": 1.1121217012405396, + "learning_rate": 3.781006378454997e-05, + "loss": 0.8754, + "step": 12295 + }, + { + "epoch": 3.732281074518136, + "grad_norm": 0.9746992588043213, + "learning_rate": 3.780500151867976e-05, + "loss": 1.0176, + "step": 12296 + }, + { + "epoch": 3.7325846107148277, + "grad_norm": 1.114150881767273, + "learning_rate": 3.779993925280956e-05, + "loss": 0.8039, + "step": 12297 + }, + { + "epoch": 3.7328881469115194, + "grad_norm": 1.0842515230178833, + "learning_rate": 3.779487698693935e-05, + "loss": 1.2914, + "step": 12298 + }, + { + "epoch": 3.7331916831082106, + "grad_norm": 0.933716893196106, + "learning_rate": 3.778981472106915e-05, + "loss": 0.7127, + "step": 12299 + }, + { + "epoch": 3.7334952193049022, + "grad_norm": 0.8623145818710327, + "learning_rate": 3.778475245519895e-05, + "loss": 0.3632, + "step": 12300 + }, + { + "epoch": 3.7337987555015935, + "grad_norm": 1.659529447555542, + "learning_rate": 3.777969018932875e-05, + "loss": 0.7509, + "step": 12301 + }, + { + "epoch": 3.734102291698285, + "grad_norm": 1.2075748443603516, + "learning_rate": 3.7774627923458544e-05, + "loss": 0.9864, + "step": 12302 + }, + { + "epoch": 3.7344058278949763, + "grad_norm": 1.193941354751587, + "learning_rate": 3.776956565758834e-05, + "loss": 0.7785, + "step": 12303 + }, + { + "epoch": 3.734709364091668, + "grad_norm": 1.2211413383483887, + "learning_rate": 3.7764503391718134e-05, + "loss": 0.532, + "step": 12304 + }, + { + "epoch": 3.7350129002883596, + "grad_norm": 1.1631566286087036, + "learning_rate": 3.7759441125847936e-05, + "loss": 1.1321, + "step": 12305 + }, + { + "epoch": 3.735316436485051, + "grad_norm": 1.2028287649154663, + "learning_rate": 3.775437885997773e-05, + "loss": 0.8627, + "step": 12306 + }, + { + "epoch": 3.735619972681742, + "grad_norm": 0.9525676369667053, + "learning_rate": 3.7749316594107525e-05, + "loss": 0.8658, + "step": 12307 + }, + { + "epoch": 3.7359235088784337, + "grad_norm": 0.8350377678871155, + "learning_rate": 3.774425432823732e-05, + "loss": 0.93, + "step": 12308 + }, + { + "epoch": 3.7362270450751254, + "grad_norm": 0.9071472883224487, + "learning_rate": 3.7739192062367115e-05, + "loss": 1.2249, + "step": 12309 + }, + { + "epoch": 3.7365305812718166, + "grad_norm": 0.9279691576957703, + "learning_rate": 3.7734129796496916e-05, + "loss": 0.7732, + "step": 12310 + }, + { + "epoch": 3.7368341174685082, + "grad_norm": 1.0739396810531616, + "learning_rate": 3.772906753062671e-05, + "loss": 1.0016, + "step": 12311 + }, + { + "epoch": 3.7371376536651995, + "grad_norm": 0.8073605895042419, + "learning_rate": 3.7724005264756506e-05, + "loss": 0.9366, + "step": 12312 + }, + { + "epoch": 3.737441189861891, + "grad_norm": 1.0669395923614502, + "learning_rate": 3.77189429988863e-05, + "loss": 0.5514, + "step": 12313 + }, + { + "epoch": 3.7377447260585823, + "grad_norm": 0.9138970375061035, + "learning_rate": 3.77138807330161e-05, + "loss": 0.6799, + "step": 12314 + }, + { + "epoch": 3.738048262255274, + "grad_norm": 1.1454130411148071, + "learning_rate": 3.77088184671459e-05, + "loss": 0.682, + "step": 12315 + }, + { + "epoch": 3.7383517984519656, + "grad_norm": 0.9921127557754517, + "learning_rate": 3.770375620127569e-05, + "loss": 0.9169, + "step": 12316 + }, + { + "epoch": 3.738655334648657, + "grad_norm": 0.9758110642433167, + "learning_rate": 3.769869393540549e-05, + "loss": 0.7134, + "step": 12317 + }, + { + "epoch": 3.738958870845348, + "grad_norm": 0.9450392723083496, + "learning_rate": 3.769363166953528e-05, + "loss": 1.2267, + "step": 12318 + }, + { + "epoch": 3.7392624070420397, + "grad_norm": 1.2100883722305298, + "learning_rate": 3.768856940366508e-05, + "loss": 0.8968, + "step": 12319 + }, + { + "epoch": 3.7395659432387314, + "grad_norm": 1.1040831804275513, + "learning_rate": 3.768350713779488e-05, + "loss": 0.7309, + "step": 12320 + }, + { + "epoch": 3.7398694794354226, + "grad_norm": 0.9729800224304199, + "learning_rate": 3.767844487192467e-05, + "loss": 1.1333, + "step": 12321 + }, + { + "epoch": 3.7401730156321142, + "grad_norm": 0.8472018241882324, + "learning_rate": 3.767338260605447e-05, + "loss": 1.3026, + "step": 12322 + }, + { + "epoch": 3.7404765518288055, + "grad_norm": 0.9644724726676941, + "learning_rate": 3.766832034018427e-05, + "loss": 1.1609, + "step": 12323 + }, + { + "epoch": 3.740780088025497, + "grad_norm": 1.176712989807129, + "learning_rate": 3.766325807431407e-05, + "loss": 0.8099, + "step": 12324 + }, + { + "epoch": 3.7410836242221883, + "grad_norm": 1.2292200326919556, + "learning_rate": 3.7658195808443865e-05, + "loss": 0.8108, + "step": 12325 + }, + { + "epoch": 3.74138716041888, + "grad_norm": 1.1356682777404785, + "learning_rate": 3.765313354257366e-05, + "loss": 1.0149, + "step": 12326 + }, + { + "epoch": 3.7416906966155716, + "grad_norm": 0.89659583568573, + "learning_rate": 3.7648071276703455e-05, + "loss": 0.9273, + "step": 12327 + }, + { + "epoch": 3.741994232812263, + "grad_norm": 1.140885829925537, + "learning_rate": 3.764300901083325e-05, + "loss": 0.9227, + "step": 12328 + }, + { + "epoch": 3.742297769008954, + "grad_norm": 1.204717993736267, + "learning_rate": 3.763794674496305e-05, + "loss": 0.6891, + "step": 12329 + }, + { + "epoch": 3.7426013052056457, + "grad_norm": 0.7554873824119568, + "learning_rate": 3.7632884479092846e-05, + "loss": 0.7479, + "step": 12330 + }, + { + "epoch": 3.7429048414023374, + "grad_norm": 0.9779210686683655, + "learning_rate": 3.762782221322264e-05, + "loss": 0.5233, + "step": 12331 + }, + { + "epoch": 3.7432083775990286, + "grad_norm": 0.9296281337738037, + "learning_rate": 3.7622759947352436e-05, + "loss": 1.0964, + "step": 12332 + }, + { + "epoch": 3.7435119137957202, + "grad_norm": 1.014298915863037, + "learning_rate": 3.761769768148223e-05, + "loss": 1.0756, + "step": 12333 + }, + { + "epoch": 3.7438154499924114, + "grad_norm": 0.9647025465965271, + "learning_rate": 3.761263541561203e-05, + "loss": 1.3927, + "step": 12334 + }, + { + "epoch": 3.744118986189103, + "grad_norm": 1.0647622346878052, + "learning_rate": 3.760757314974183e-05, + "loss": 0.4717, + "step": 12335 + }, + { + "epoch": 3.7444225223857943, + "grad_norm": 1.0516674518585205, + "learning_rate": 3.760251088387162e-05, + "loss": 1.2176, + "step": 12336 + }, + { + "epoch": 3.744726058582486, + "grad_norm": 1.0718040466308594, + "learning_rate": 3.7597448618001416e-05, + "loss": 0.6752, + "step": 12337 + }, + { + "epoch": 3.7450295947791776, + "grad_norm": 1.1309189796447754, + "learning_rate": 3.759238635213122e-05, + "loss": 1.0383, + "step": 12338 + }, + { + "epoch": 3.745333130975869, + "grad_norm": 1.2614867687225342, + "learning_rate": 3.758732408626101e-05, + "loss": 0.7633, + "step": 12339 + }, + { + "epoch": 3.74563666717256, + "grad_norm": 1.054693341255188, + "learning_rate": 3.758226182039081e-05, + "loss": 0.8525, + "step": 12340 + }, + { + "epoch": 3.7459402033692517, + "grad_norm": 1.1138081550598145, + "learning_rate": 3.75771995545206e-05, + "loss": 1.2251, + "step": 12341 + }, + { + "epoch": 3.7462437395659434, + "grad_norm": 1.1078453063964844, + "learning_rate": 3.75721372886504e-05, + "loss": 1.182, + "step": 12342 + }, + { + "epoch": 3.7465472757626346, + "grad_norm": 1.276255488395691, + "learning_rate": 3.75670750227802e-05, + "loss": 0.8127, + "step": 12343 + }, + { + "epoch": 3.7468508119593262, + "grad_norm": 1.1695194244384766, + "learning_rate": 3.7562012756909994e-05, + "loss": 0.7834, + "step": 12344 + }, + { + "epoch": 3.7471543481560174, + "grad_norm": 1.1870728731155396, + "learning_rate": 3.755695049103979e-05, + "loss": 0.8002, + "step": 12345 + }, + { + "epoch": 3.747457884352709, + "grad_norm": 0.9569132924079895, + "learning_rate": 3.755188822516959e-05, + "loss": 1.6116, + "step": 12346 + }, + { + "epoch": 3.7477614205494003, + "grad_norm": 1.167601466178894, + "learning_rate": 3.7546825959299385e-05, + "loss": 1.2596, + "step": 12347 + }, + { + "epoch": 3.748064956746092, + "grad_norm": 1.03522789478302, + "learning_rate": 3.754176369342918e-05, + "loss": 0.9699, + "step": 12348 + }, + { + "epoch": 3.7483684929427836, + "grad_norm": 1.072169303894043, + "learning_rate": 3.753670142755898e-05, + "loss": 1.1518, + "step": 12349 + }, + { + "epoch": 3.748672029139475, + "grad_norm": 0.9569816589355469, + "learning_rate": 3.7531639161688776e-05, + "loss": 1.0596, + "step": 12350 + }, + { + "epoch": 3.7489755653361665, + "grad_norm": 1.2347333431243896, + "learning_rate": 3.752657689581857e-05, + "loss": 1.2727, + "step": 12351 + }, + { + "epoch": 3.7492791015328577, + "grad_norm": 0.9841341376304626, + "learning_rate": 3.7521514629948365e-05, + "loss": 1.1204, + "step": 12352 + }, + { + "epoch": 3.7495826377295494, + "grad_norm": 0.9956382513046265, + "learning_rate": 3.751645236407817e-05, + "loss": 1.0603, + "step": 12353 + }, + { + "epoch": 3.7498861739262406, + "grad_norm": 1.1098010540008545, + "learning_rate": 3.751139009820796e-05, + "loss": 1.0862, + "step": 12354 + }, + { + "epoch": 3.7501897101229322, + "grad_norm": 0.9175969958305359, + "learning_rate": 3.7506327832337757e-05, + "loss": 0.72, + "step": 12355 + }, + { + "epoch": 3.750493246319624, + "grad_norm": 1.0878384113311768, + "learning_rate": 3.750126556646755e-05, + "loss": 0.8651, + "step": 12356 + }, + { + "epoch": 3.750796782516315, + "grad_norm": 1.253101110458374, + "learning_rate": 3.7496203300597346e-05, + "loss": 0.8921, + "step": 12357 + }, + { + "epoch": 3.7511003187130063, + "grad_norm": 0.8898255825042725, + "learning_rate": 3.749114103472715e-05, + "loss": 1.364, + "step": 12358 + }, + { + "epoch": 3.751403854909698, + "grad_norm": 1.1155585050582886, + "learning_rate": 3.748607876885694e-05, + "loss": 0.857, + "step": 12359 + }, + { + "epoch": 3.7517073911063896, + "grad_norm": 1.171615481376648, + "learning_rate": 3.748101650298674e-05, + "loss": 1.1296, + "step": 12360 + }, + { + "epoch": 3.752010927303081, + "grad_norm": 1.175969123840332, + "learning_rate": 3.747595423711653e-05, + "loss": 1.1668, + "step": 12361 + }, + { + "epoch": 3.7523144634997725, + "grad_norm": 1.1900206804275513, + "learning_rate": 3.747089197124633e-05, + "loss": 0.9471, + "step": 12362 + }, + { + "epoch": 3.7526179996964637, + "grad_norm": 0.9277837872505188, + "learning_rate": 3.746582970537613e-05, + "loss": 0.9782, + "step": 12363 + }, + { + "epoch": 3.7529215358931554, + "grad_norm": 0.922542929649353, + "learning_rate": 3.746076743950592e-05, + "loss": 0.3285, + "step": 12364 + }, + { + "epoch": 3.7532250720898466, + "grad_norm": 1.132167935371399, + "learning_rate": 3.745570517363572e-05, + "loss": 1.0414, + "step": 12365 + }, + { + "epoch": 3.7535286082865382, + "grad_norm": 1.1880006790161133, + "learning_rate": 3.745064290776551e-05, + "loss": 0.9354, + "step": 12366 + }, + { + "epoch": 3.75383214448323, + "grad_norm": 1.2487422227859497, + "learning_rate": 3.7445580641895314e-05, + "loss": 0.9507, + "step": 12367 + }, + { + "epoch": 3.754135680679921, + "grad_norm": 0.8833310604095459, + "learning_rate": 3.7440518376025116e-05, + "loss": 0.5118, + "step": 12368 + }, + { + "epoch": 3.7544392168766123, + "grad_norm": 1.1800538301467896, + "learning_rate": 3.743545611015491e-05, + "loss": 0.9198, + "step": 12369 + }, + { + "epoch": 3.754742753073304, + "grad_norm": 0.9771001935005188, + "learning_rate": 3.7430393844284706e-05, + "loss": 0.3563, + "step": 12370 + }, + { + "epoch": 3.7550462892699956, + "grad_norm": 1.09959077835083, + "learning_rate": 3.74253315784145e-05, + "loss": 0.6128, + "step": 12371 + }, + { + "epoch": 3.755349825466687, + "grad_norm": 1.3109769821166992, + "learning_rate": 3.7420269312544295e-05, + "loss": 0.8962, + "step": 12372 + }, + { + "epoch": 3.7556533616633785, + "grad_norm": 0.9994189739227295, + "learning_rate": 3.74152070466741e-05, + "loss": 1.0178, + "step": 12373 + }, + { + "epoch": 3.7559568978600697, + "grad_norm": 1.0077166557312012, + "learning_rate": 3.741014478080389e-05, + "loss": 1.2622, + "step": 12374 + }, + { + "epoch": 3.7562604340567614, + "grad_norm": 1.1626579761505127, + "learning_rate": 3.7405082514933686e-05, + "loss": 0.8729, + "step": 12375 + }, + { + "epoch": 3.7565639702534526, + "grad_norm": 0.8556549549102783, + "learning_rate": 3.740002024906348e-05, + "loss": 0.8845, + "step": 12376 + }, + { + "epoch": 3.7568675064501442, + "grad_norm": 1.2458155155181885, + "learning_rate": 3.739495798319328e-05, + "loss": 1.1022, + "step": 12377 + }, + { + "epoch": 3.757171042646836, + "grad_norm": 0.8295066952705383, + "learning_rate": 3.738989571732308e-05, + "loss": 0.9566, + "step": 12378 + }, + { + "epoch": 3.757474578843527, + "grad_norm": 0.9808529615402222, + "learning_rate": 3.738483345145287e-05, + "loss": 1.0131, + "step": 12379 + }, + { + "epoch": 3.7577781150402183, + "grad_norm": 1.0494447946548462, + "learning_rate": 3.737977118558267e-05, + "loss": 0.6017, + "step": 12380 + }, + { + "epoch": 3.75808165123691, + "grad_norm": 0.9639920592308044, + "learning_rate": 3.737470891971246e-05, + "loss": 0.6646, + "step": 12381 + }, + { + "epoch": 3.7583851874336016, + "grad_norm": 0.9149377942085266, + "learning_rate": 3.7369646653842263e-05, + "loss": 0.7613, + "step": 12382 + }, + { + "epoch": 3.758688723630293, + "grad_norm": 1.0218437910079956, + "learning_rate": 3.736458438797206e-05, + "loss": 0.9877, + "step": 12383 + }, + { + "epoch": 3.7589922598269845, + "grad_norm": 0.9344059228897095, + "learning_rate": 3.735952212210185e-05, + "loss": 1.1299, + "step": 12384 + }, + { + "epoch": 3.7592957960236757, + "grad_norm": 1.2185781002044678, + "learning_rate": 3.735445985623165e-05, + "loss": 0.9772, + "step": 12385 + }, + { + "epoch": 3.7595993322203674, + "grad_norm": 0.8353180289268494, + "learning_rate": 3.734939759036144e-05, + "loss": 1.1411, + "step": 12386 + }, + { + "epoch": 3.7599028684170586, + "grad_norm": 1.3387069702148438, + "learning_rate": 3.7344335324491244e-05, + "loss": 0.7524, + "step": 12387 + }, + { + "epoch": 3.76020640461375, + "grad_norm": 0.901246964931488, + "learning_rate": 3.733927305862104e-05, + "loss": 0.9927, + "step": 12388 + }, + { + "epoch": 3.760509940810442, + "grad_norm": 1.0288785696029663, + "learning_rate": 3.7334210792750834e-05, + "loss": 1.1281, + "step": 12389 + }, + { + "epoch": 3.760813477007133, + "grad_norm": 0.9363031983375549, + "learning_rate": 3.7329148526880635e-05, + "loss": 1.0534, + "step": 12390 + }, + { + "epoch": 3.7611170132038243, + "grad_norm": 0.8846297264099121, + "learning_rate": 3.732408626101043e-05, + "loss": 0.8599, + "step": 12391 + }, + { + "epoch": 3.761420549400516, + "grad_norm": 0.8425638675689697, + "learning_rate": 3.731902399514023e-05, + "loss": 0.9676, + "step": 12392 + }, + { + "epoch": 3.7617240855972076, + "grad_norm": 1.0794261693954468, + "learning_rate": 3.7313961729270027e-05, + "loss": 0.7099, + "step": 12393 + }, + { + "epoch": 3.762027621793899, + "grad_norm": 0.879723072052002, + "learning_rate": 3.730889946339982e-05, + "loss": 0.395, + "step": 12394 + }, + { + "epoch": 3.7623311579905905, + "grad_norm": 1.1528816223144531, + "learning_rate": 3.7303837197529616e-05, + "loss": 0.7806, + "step": 12395 + }, + { + "epoch": 3.7626346941872817, + "grad_norm": 0.8738396763801575, + "learning_rate": 3.729877493165941e-05, + "loss": 1.3985, + "step": 12396 + }, + { + "epoch": 3.7629382303839733, + "grad_norm": 1.070351004600525, + "learning_rate": 3.729371266578921e-05, + "loss": 0.9392, + "step": 12397 + }, + { + "epoch": 3.7632417665806646, + "grad_norm": 1.1746009588241577, + "learning_rate": 3.728865039991901e-05, + "loss": 1.0452, + "step": 12398 + }, + { + "epoch": 3.763545302777356, + "grad_norm": 1.0018872022628784, + "learning_rate": 3.72835881340488e-05, + "loss": 0.6711, + "step": 12399 + }, + { + "epoch": 3.763848838974048, + "grad_norm": 1.1107298135757446, + "learning_rate": 3.72785258681786e-05, + "loss": 1.0681, + "step": 12400 + }, + { + "epoch": 3.764152375170739, + "grad_norm": 1.1544559001922607, + "learning_rate": 3.727346360230839e-05, + "loss": 0.8748, + "step": 12401 + }, + { + "epoch": 3.7644559113674303, + "grad_norm": 0.9350548386573792, + "learning_rate": 3.726840133643819e-05, + "loss": 1.1471, + "step": 12402 + }, + { + "epoch": 3.764759447564122, + "grad_norm": 0.8423395752906799, + "learning_rate": 3.726333907056799e-05, + "loss": 0.8202, + "step": 12403 + }, + { + "epoch": 3.7650629837608136, + "grad_norm": 0.8564968705177307, + "learning_rate": 3.725827680469778e-05, + "loss": 0.7742, + "step": 12404 + }, + { + "epoch": 3.765366519957505, + "grad_norm": 0.8022836446762085, + "learning_rate": 3.725321453882758e-05, + "loss": 1.6734, + "step": 12405 + }, + { + "epoch": 3.7656700561541965, + "grad_norm": 0.5397734642028809, + "learning_rate": 3.724815227295738e-05, + "loss": 0.3346, + "step": 12406 + }, + { + "epoch": 3.7659735923508877, + "grad_norm": 1.2643107175827026, + "learning_rate": 3.7243090007087174e-05, + "loss": 1.098, + "step": 12407 + }, + { + "epoch": 3.7662771285475793, + "grad_norm": 1.0797953605651855, + "learning_rate": 3.723802774121697e-05, + "loss": 1.1159, + "step": 12408 + }, + { + "epoch": 3.7665806647442706, + "grad_norm": 1.0689975023269653, + "learning_rate": 3.7232965475346764e-05, + "loss": 1.0167, + "step": 12409 + }, + { + "epoch": 3.766884200940962, + "grad_norm": 0.9162440299987793, + "learning_rate": 3.722790320947656e-05, + "loss": 0.4559, + "step": 12410 + }, + { + "epoch": 3.767187737137654, + "grad_norm": 1.250971794128418, + "learning_rate": 3.722284094360636e-05, + "loss": 0.7829, + "step": 12411 + }, + { + "epoch": 3.767491273334345, + "grad_norm": 1.2255014181137085, + "learning_rate": 3.721777867773616e-05, + "loss": 0.6269, + "step": 12412 + }, + { + "epoch": 3.7677948095310367, + "grad_norm": 0.872760534286499, + "learning_rate": 3.7212716411865956e-05, + "loss": 1.6415, + "step": 12413 + }, + { + "epoch": 3.768098345727728, + "grad_norm": 1.116316318511963, + "learning_rate": 3.720765414599575e-05, + "loss": 0.8914, + "step": 12414 + }, + { + "epoch": 3.7684018819244196, + "grad_norm": 1.148911476135254, + "learning_rate": 3.7202591880125546e-05, + "loss": 0.8778, + "step": 12415 + }, + { + "epoch": 3.768705418121111, + "grad_norm": 1.2708046436309814, + "learning_rate": 3.719752961425535e-05, + "loss": 0.7843, + "step": 12416 + }, + { + "epoch": 3.7690089543178025, + "grad_norm": 0.9904307723045349, + "learning_rate": 3.719246734838514e-05, + "loss": 0.9592, + "step": 12417 + }, + { + "epoch": 3.769312490514494, + "grad_norm": 1.1411200761795044, + "learning_rate": 3.718740508251494e-05, + "loss": 1.0106, + "step": 12418 + }, + { + "epoch": 3.7696160267111853, + "grad_norm": 1.075008749961853, + "learning_rate": 3.718234281664473e-05, + "loss": 0.8395, + "step": 12419 + }, + { + "epoch": 3.7699195629078766, + "grad_norm": 0.8893365263938904, + "learning_rate": 3.717728055077453e-05, + "loss": 1.313, + "step": 12420 + }, + { + "epoch": 3.770223099104568, + "grad_norm": 1.1124544143676758, + "learning_rate": 3.717221828490433e-05, + "loss": 1.2117, + "step": 12421 + }, + { + "epoch": 3.77052663530126, + "grad_norm": 1.2074543237686157, + "learning_rate": 3.716715601903412e-05, + "loss": 0.9904, + "step": 12422 + }, + { + "epoch": 3.770830171497951, + "grad_norm": 0.9946209192276001, + "learning_rate": 3.716209375316392e-05, + "loss": 1.1174, + "step": 12423 + }, + { + "epoch": 3.7711337076946427, + "grad_norm": 1.1267931461334229, + "learning_rate": 3.715703148729371e-05, + "loss": 0.9255, + "step": 12424 + }, + { + "epoch": 3.771437243891334, + "grad_norm": 0.9244359731674194, + "learning_rate": 3.715196922142351e-05, + "loss": 0.521, + "step": 12425 + }, + { + "epoch": 3.7717407800880256, + "grad_norm": 1.164851188659668, + "learning_rate": 3.714690695555331e-05, + "loss": 0.9026, + "step": 12426 + }, + { + "epoch": 3.772044316284717, + "grad_norm": 1.1922281980514526, + "learning_rate": 3.7141844689683104e-05, + "loss": 0.8423, + "step": 12427 + }, + { + "epoch": 3.7723478524814085, + "grad_norm": 1.0533910989761353, + "learning_rate": 3.71367824238129e-05, + "loss": 0.8489, + "step": 12428 + }, + { + "epoch": 3.7726513886781, + "grad_norm": 0.9692511558532715, + "learning_rate": 3.713172015794269e-05, + "loss": 0.8059, + "step": 12429 + }, + { + "epoch": 3.7729549248747913, + "grad_norm": 1.400119662284851, + "learning_rate": 3.7126657892072495e-05, + "loss": 0.7337, + "step": 12430 + }, + { + "epoch": 3.7732584610714826, + "grad_norm": 0.8697136640548706, + "learning_rate": 3.712159562620229e-05, + "loss": 1.4546, + "step": 12431 + }, + { + "epoch": 3.773561997268174, + "grad_norm": 1.0246466398239136, + "learning_rate": 3.7116533360332085e-05, + "loss": 1.1457, + "step": 12432 + }, + { + "epoch": 3.773865533464866, + "grad_norm": 1.0675830841064453, + "learning_rate": 3.711147109446188e-05, + "loss": 1.1213, + "step": 12433 + }, + { + "epoch": 3.774169069661557, + "grad_norm": 1.2050386667251587, + "learning_rate": 3.7106408828591674e-05, + "loss": 0.7348, + "step": 12434 + }, + { + "epoch": 3.7744726058582487, + "grad_norm": 0.9337054491043091, + "learning_rate": 3.7101346562721476e-05, + "loss": 0.6828, + "step": 12435 + }, + { + "epoch": 3.77477614205494, + "grad_norm": 1.0316612720489502, + "learning_rate": 3.709628429685128e-05, + "loss": 0.6499, + "step": 12436 + }, + { + "epoch": 3.7750796782516316, + "grad_norm": 1.2939015626907349, + "learning_rate": 3.709122203098107e-05, + "loss": 0.9652, + "step": 12437 + }, + { + "epoch": 3.775383214448323, + "grad_norm": 1.1328450441360474, + "learning_rate": 3.708615976511087e-05, + "loss": 0.6091, + "step": 12438 + }, + { + "epoch": 3.7756867506450145, + "grad_norm": 1.0616259574890137, + "learning_rate": 3.708109749924066e-05, + "loss": 1.0478, + "step": 12439 + }, + { + "epoch": 3.775990286841706, + "grad_norm": 0.8670310974121094, + "learning_rate": 3.7076035233370456e-05, + "loss": 0.7452, + "step": 12440 + }, + { + "epoch": 3.7762938230383973, + "grad_norm": 0.9887956380844116, + "learning_rate": 3.707097296750026e-05, + "loss": 0.6839, + "step": 12441 + }, + { + "epoch": 3.7765973592350885, + "grad_norm": 0.9409323334693909, + "learning_rate": 3.706591070163005e-05, + "loss": 0.6177, + "step": 12442 + }, + { + "epoch": 3.77690089543178, + "grad_norm": 1.085896611213684, + "learning_rate": 3.706084843575985e-05, + "loss": 0.984, + "step": 12443 + }, + { + "epoch": 3.777204431628472, + "grad_norm": 1.2605316638946533, + "learning_rate": 3.705578616988964e-05, + "loss": 0.6324, + "step": 12444 + }, + { + "epoch": 3.777507967825163, + "grad_norm": 0.8040452599525452, + "learning_rate": 3.7050723904019444e-05, + "loss": 0.9934, + "step": 12445 + }, + { + "epoch": 3.7778115040218547, + "grad_norm": 0.9774128198623657, + "learning_rate": 3.704566163814924e-05, + "loss": 0.6604, + "step": 12446 + }, + { + "epoch": 3.778115040218546, + "grad_norm": 0.9219763875007629, + "learning_rate": 3.7040599372279034e-05, + "loss": 0.8763, + "step": 12447 + }, + { + "epoch": 3.7784185764152376, + "grad_norm": 1.030827522277832, + "learning_rate": 3.703553710640883e-05, + "loss": 0.8398, + "step": 12448 + }, + { + "epoch": 3.778722112611929, + "grad_norm": 1.1267133951187134, + "learning_rate": 3.703047484053862e-05, + "loss": 0.9505, + "step": 12449 + }, + { + "epoch": 3.7790256488086205, + "grad_norm": 1.189131736755371, + "learning_rate": 3.7025412574668425e-05, + "loss": 0.9392, + "step": 12450 + }, + { + "epoch": 3.779329185005312, + "grad_norm": 1.247646450996399, + "learning_rate": 3.702035030879822e-05, + "loss": 0.7988, + "step": 12451 + }, + { + "epoch": 3.7796327212020033, + "grad_norm": 0.9839622974395752, + "learning_rate": 3.7015288042928014e-05, + "loss": 0.6276, + "step": 12452 + }, + { + "epoch": 3.7799362573986945, + "grad_norm": 1.194143533706665, + "learning_rate": 3.701022577705781e-05, + "loss": 1.0647, + "step": 12453 + }, + { + "epoch": 3.780239793595386, + "grad_norm": 1.1474366188049316, + "learning_rate": 3.7005163511187604e-05, + "loss": 0.949, + "step": 12454 + }, + { + "epoch": 3.780543329792078, + "grad_norm": 1.1925603151321411, + "learning_rate": 3.7000101245317405e-05, + "loss": 0.6462, + "step": 12455 + }, + { + "epoch": 3.780846865988769, + "grad_norm": 1.0543676614761353, + "learning_rate": 3.69950389794472e-05, + "loss": 1.1766, + "step": 12456 + }, + { + "epoch": 3.7811504021854607, + "grad_norm": 0.9596462249755859, + "learning_rate": 3.6989976713577e-05, + "loss": 0.9552, + "step": 12457 + }, + { + "epoch": 3.781453938382152, + "grad_norm": 0.9737072587013245, + "learning_rate": 3.69849144477068e-05, + "loss": 0.9835, + "step": 12458 + }, + { + "epoch": 3.7817574745788436, + "grad_norm": 0.9812677502632141, + "learning_rate": 3.697985218183659e-05, + "loss": 0.9774, + "step": 12459 + }, + { + "epoch": 3.782061010775535, + "grad_norm": 1.186084270477295, + "learning_rate": 3.697478991596639e-05, + "loss": 0.9106, + "step": 12460 + }, + { + "epoch": 3.7823645469722265, + "grad_norm": 1.3368810415267944, + "learning_rate": 3.696972765009619e-05, + "loss": 0.7, + "step": 12461 + }, + { + "epoch": 3.782668083168918, + "grad_norm": 1.206284761428833, + "learning_rate": 3.696466538422598e-05, + "loss": 0.978, + "step": 12462 + }, + { + "epoch": 3.7829716193656093, + "grad_norm": 1.0115655660629272, + "learning_rate": 3.695960311835578e-05, + "loss": 0.9621, + "step": 12463 + }, + { + "epoch": 3.7832751555623005, + "grad_norm": 1.0173842906951904, + "learning_rate": 3.695454085248557e-05, + "loss": 1.0791, + "step": 12464 + }, + { + "epoch": 3.783578691758992, + "grad_norm": 1.065743088722229, + "learning_rate": 3.6949478586615374e-05, + "loss": 1.0654, + "step": 12465 + }, + { + "epoch": 3.783882227955684, + "grad_norm": 0.9688126444816589, + "learning_rate": 3.694441632074517e-05, + "loss": 0.8817, + "step": 12466 + }, + { + "epoch": 3.784185764152375, + "grad_norm": 1.136647343635559, + "learning_rate": 3.693935405487496e-05, + "loss": 0.5064, + "step": 12467 + }, + { + "epoch": 3.7844893003490667, + "grad_norm": 1.0706472396850586, + "learning_rate": 3.693429178900476e-05, + "loss": 1.003, + "step": 12468 + }, + { + "epoch": 3.784792836545758, + "grad_norm": 1.1312788724899292, + "learning_rate": 3.692922952313456e-05, + "loss": 0.5844, + "step": 12469 + }, + { + "epoch": 3.7850963727424496, + "grad_norm": 0.9325472712516785, + "learning_rate": 3.6924167257264354e-05, + "loss": 1.1911, + "step": 12470 + }, + { + "epoch": 3.785399908939141, + "grad_norm": 0.8451688885688782, + "learning_rate": 3.691910499139415e-05, + "loss": 1.3916, + "step": 12471 + }, + { + "epoch": 3.7857034451358325, + "grad_norm": 0.6755526065826416, + "learning_rate": 3.6914042725523944e-05, + "loss": 1.066, + "step": 12472 + }, + { + "epoch": 3.786006981332524, + "grad_norm": 0.9279147386550903, + "learning_rate": 3.690898045965374e-05, + "loss": 0.6723, + "step": 12473 + }, + { + "epoch": 3.7863105175292153, + "grad_norm": 1.0263065099716187, + "learning_rate": 3.690391819378354e-05, + "loss": 0.6167, + "step": 12474 + }, + { + "epoch": 3.7866140537259065, + "grad_norm": 1.1038944721221924, + "learning_rate": 3.6898855927913335e-05, + "loss": 0.8708, + "step": 12475 + }, + { + "epoch": 3.786917589922598, + "grad_norm": 0.5807709097862244, + "learning_rate": 3.689379366204313e-05, + "loss": 1.0277, + "step": 12476 + }, + { + "epoch": 3.78722112611929, + "grad_norm": 1.2162609100341797, + "learning_rate": 3.6888731396172925e-05, + "loss": 0.9676, + "step": 12477 + }, + { + "epoch": 3.787524662315981, + "grad_norm": 1.2073674201965332, + "learning_rate": 3.688366913030272e-05, + "loss": 0.7189, + "step": 12478 + }, + { + "epoch": 3.7878281985126727, + "grad_norm": 1.0670849084854126, + "learning_rate": 3.687860686443252e-05, + "loss": 1.1586, + "step": 12479 + }, + { + "epoch": 3.788131734709364, + "grad_norm": 0.9557695984840393, + "learning_rate": 3.687354459856232e-05, + "loss": 0.5827, + "step": 12480 + }, + { + "epoch": 3.7884352709060556, + "grad_norm": 0.9041797518730164, + "learning_rate": 3.686848233269212e-05, + "loss": 1.0817, + "step": 12481 + }, + { + "epoch": 3.788738807102747, + "grad_norm": 0.9924010634422302, + "learning_rate": 3.686342006682191e-05, + "loss": 0.8866, + "step": 12482 + }, + { + "epoch": 3.7890423432994385, + "grad_norm": 0.9935707449913025, + "learning_rate": 3.685835780095171e-05, + "loss": 1.0209, + "step": 12483 + }, + { + "epoch": 3.78934587949613, + "grad_norm": 0.9138877987861633, + "learning_rate": 3.685329553508151e-05, + "loss": 1.4457, + "step": 12484 + }, + { + "epoch": 3.7896494156928213, + "grad_norm": 0.8974517583847046, + "learning_rate": 3.6848233269211304e-05, + "loss": 0.8027, + "step": 12485 + }, + { + "epoch": 3.789952951889513, + "grad_norm": 1.0018976926803589, + "learning_rate": 3.68431710033411e-05, + "loss": 1.1763, + "step": 12486 + }, + { + "epoch": 3.790256488086204, + "grad_norm": 1.0108580589294434, + "learning_rate": 3.683810873747089e-05, + "loss": 1.6347, + "step": 12487 + }, + { + "epoch": 3.790560024282896, + "grad_norm": 1.2577271461486816, + "learning_rate": 3.683304647160069e-05, + "loss": 0.4809, + "step": 12488 + }, + { + "epoch": 3.790863560479587, + "grad_norm": 1.0133095979690552, + "learning_rate": 3.682798420573049e-05, + "loss": 1.0019, + "step": 12489 + }, + { + "epoch": 3.7911670966762787, + "grad_norm": 1.0451486110687256, + "learning_rate": 3.6822921939860284e-05, + "loss": 0.9594, + "step": 12490 + }, + { + "epoch": 3.7914706328729704, + "grad_norm": 0.8475046157836914, + "learning_rate": 3.681785967399008e-05, + "loss": 0.8753, + "step": 12491 + }, + { + "epoch": 3.7917741690696616, + "grad_norm": 0.5309485793113708, + "learning_rate": 3.6812797408119874e-05, + "loss": 0.8365, + "step": 12492 + }, + { + "epoch": 3.792077705266353, + "grad_norm": 1.0600450038909912, + "learning_rate": 3.680773514224967e-05, + "loss": 1.2161, + "step": 12493 + }, + { + "epoch": 3.7923812414630445, + "grad_norm": 1.2394740581512451, + "learning_rate": 3.680267287637947e-05, + "loss": 0.8221, + "step": 12494 + }, + { + "epoch": 3.792684777659736, + "grad_norm": 1.2992359399795532, + "learning_rate": 3.6797610610509265e-05, + "loss": 0.9025, + "step": 12495 + }, + { + "epoch": 3.7929883138564273, + "grad_norm": 1.0954234600067139, + "learning_rate": 3.679254834463906e-05, + "loss": 1.0929, + "step": 12496 + }, + { + "epoch": 3.793291850053119, + "grad_norm": 1.4118071794509888, + "learning_rate": 3.6787486078768855e-05, + "loss": 0.5356, + "step": 12497 + }, + { + "epoch": 3.79359538624981, + "grad_norm": 0.8720954060554504, + "learning_rate": 3.6782423812898656e-05, + "loss": 0.8325, + "step": 12498 + }, + { + "epoch": 3.793898922446502, + "grad_norm": 1.042973279953003, + "learning_rate": 3.677736154702845e-05, + "loss": 1.141, + "step": 12499 + }, + { + "epoch": 3.794202458643193, + "grad_norm": 1.0451314449310303, + "learning_rate": 3.6772299281158246e-05, + "loss": 0.9965, + "step": 12500 + }, + { + "epoch": 3.7945059948398847, + "grad_norm": 0.8057799339294434, + "learning_rate": 3.676723701528805e-05, + "loss": 0.3991, + "step": 12501 + }, + { + "epoch": 3.7948095310365764, + "grad_norm": 1.0030001401901245, + "learning_rate": 3.676217474941784e-05, + "loss": 0.6943, + "step": 12502 + }, + { + "epoch": 3.7951130672332676, + "grad_norm": 1.1071726083755493, + "learning_rate": 3.675711248354764e-05, + "loss": 0.8517, + "step": 12503 + }, + { + "epoch": 3.795416603429959, + "grad_norm": 0.8781371116638184, + "learning_rate": 3.675205021767744e-05, + "loss": 0.7278, + "step": 12504 + }, + { + "epoch": 3.7957201396266504, + "grad_norm": 1.134009599685669, + "learning_rate": 3.674698795180723e-05, + "loss": 0.8761, + "step": 12505 + }, + { + "epoch": 3.796023675823342, + "grad_norm": 0.7201947569847107, + "learning_rate": 3.674192568593703e-05, + "loss": 0.5329, + "step": 12506 + }, + { + "epoch": 3.7963272120200333, + "grad_norm": 1.2298204898834229, + "learning_rate": 3.673686342006682e-05, + "loss": 0.715, + "step": 12507 + }, + { + "epoch": 3.796630748216725, + "grad_norm": 1.0398313999176025, + "learning_rate": 3.6731801154196624e-05, + "loss": 0.9505, + "step": 12508 + }, + { + "epoch": 3.796934284413416, + "grad_norm": 1.0274169445037842, + "learning_rate": 3.672673888832642e-05, + "loss": 0.7832, + "step": 12509 + }, + { + "epoch": 3.797237820610108, + "grad_norm": 0.9675783514976501, + "learning_rate": 3.6721676622456214e-05, + "loss": 0.9994, + "step": 12510 + }, + { + "epoch": 3.797541356806799, + "grad_norm": 0.9080022573471069, + "learning_rate": 3.671661435658601e-05, + "loss": 1.3438, + "step": 12511 + }, + { + "epoch": 3.7978448930034907, + "grad_norm": 1.1123250722885132, + "learning_rate": 3.6711552090715804e-05, + "loss": 1.0362, + "step": 12512 + }, + { + "epoch": 3.7981484292001824, + "grad_norm": 1.0559964179992676, + "learning_rate": 3.6706489824845605e-05, + "loss": 1.0324, + "step": 12513 + }, + { + "epoch": 3.7984519653968736, + "grad_norm": 0.880066990852356, + "learning_rate": 3.67014275589754e-05, + "loss": 0.7476, + "step": 12514 + }, + { + "epoch": 3.798755501593565, + "grad_norm": 1.1622546911239624, + "learning_rate": 3.6696365293105195e-05, + "loss": 0.706, + "step": 12515 + }, + { + "epoch": 3.7990590377902564, + "grad_norm": 1.081712245941162, + "learning_rate": 3.669130302723499e-05, + "loss": 1.1936, + "step": 12516 + }, + { + "epoch": 3.799362573986948, + "grad_norm": 0.9707673192024231, + "learning_rate": 3.6686240761364784e-05, + "loss": 1.3008, + "step": 12517 + }, + { + "epoch": 3.7996661101836393, + "grad_norm": 1.1115853786468506, + "learning_rate": 3.6681178495494586e-05, + "loss": 0.8095, + "step": 12518 + }, + { + "epoch": 3.799969646380331, + "grad_norm": 1.011857271194458, + "learning_rate": 3.667611622962438e-05, + "loss": 1.2693, + "step": 12519 + }, + { + "epoch": 3.800273182577022, + "grad_norm": 1.0544657707214355, + "learning_rate": 3.6671053963754176e-05, + "loss": 0.7931, + "step": 12520 + }, + { + "epoch": 3.800576718773714, + "grad_norm": 0.7806882858276367, + "learning_rate": 3.666599169788397e-05, + "loss": 0.6562, + "step": 12521 + }, + { + "epoch": 3.800880254970405, + "grad_norm": 0.8974125981330872, + "learning_rate": 3.666092943201377e-05, + "loss": 1.0769, + "step": 12522 + }, + { + "epoch": 3.8011837911670967, + "grad_norm": 1.239742636680603, + "learning_rate": 3.665586716614357e-05, + "loss": 1.0355, + "step": 12523 + }, + { + "epoch": 3.8014873273637884, + "grad_norm": 1.1388022899627686, + "learning_rate": 3.665080490027337e-05, + "loss": 0.8971, + "step": 12524 + }, + { + "epoch": 3.8017908635604796, + "grad_norm": 1.069403886795044, + "learning_rate": 3.664574263440316e-05, + "loss": 0.9201, + "step": 12525 + }, + { + "epoch": 3.802094399757171, + "grad_norm": 0.9443103075027466, + "learning_rate": 3.664068036853296e-05, + "loss": 1.0174, + "step": 12526 + }, + { + "epoch": 3.8023979359538624, + "grad_norm": 0.8539696335792542, + "learning_rate": 3.663561810266275e-05, + "loss": 1.2065, + "step": 12527 + }, + { + "epoch": 3.802701472150554, + "grad_norm": 0.9192683100700378, + "learning_rate": 3.6630555836792554e-05, + "loss": 1.1406, + "step": 12528 + }, + { + "epoch": 3.8030050083472453, + "grad_norm": 1.1776856184005737, + "learning_rate": 3.662549357092235e-05, + "loss": 0.8645, + "step": 12529 + }, + { + "epoch": 3.803308544543937, + "grad_norm": 1.1186996698379517, + "learning_rate": 3.6620431305052144e-05, + "loss": 0.711, + "step": 12530 + }, + { + "epoch": 3.803612080740628, + "grad_norm": 1.0336960554122925, + "learning_rate": 3.661536903918194e-05, + "loss": 1.184, + "step": 12531 + }, + { + "epoch": 3.80391561693732, + "grad_norm": 1.1766363382339478, + "learning_rate": 3.6610306773311733e-05, + "loss": 0.7574, + "step": 12532 + }, + { + "epoch": 3.804219153134011, + "grad_norm": 0.8887657523155212, + "learning_rate": 3.6605244507441535e-05, + "loss": 0.8771, + "step": 12533 + }, + { + "epoch": 3.8045226893307027, + "grad_norm": 0.9370740652084351, + "learning_rate": 3.660018224157133e-05, + "loss": 1.0695, + "step": 12534 + }, + { + "epoch": 3.8048262255273944, + "grad_norm": 1.2718757390975952, + "learning_rate": 3.6595119975701125e-05, + "loss": 1.097, + "step": 12535 + }, + { + "epoch": 3.8051297617240856, + "grad_norm": 0.7982632517814636, + "learning_rate": 3.659005770983092e-05, + "loss": 0.958, + "step": 12536 + }, + { + "epoch": 3.805433297920777, + "grad_norm": 1.3425947427749634, + "learning_rate": 3.658499544396072e-05, + "loss": 0.6905, + "step": 12537 + }, + { + "epoch": 3.8057368341174684, + "grad_norm": 1.2812447547912598, + "learning_rate": 3.6579933178090516e-05, + "loss": 0.7141, + "step": 12538 + }, + { + "epoch": 3.80604037031416, + "grad_norm": 0.9885139465332031, + "learning_rate": 3.657487091222031e-05, + "loss": 1.088, + "step": 12539 + }, + { + "epoch": 3.8063439065108513, + "grad_norm": 1.1644903421401978, + "learning_rate": 3.6569808646350105e-05, + "loss": 1.1386, + "step": 12540 + }, + { + "epoch": 3.806647442707543, + "grad_norm": 0.9741769433021545, + "learning_rate": 3.65647463804799e-05, + "loss": 1.1833, + "step": 12541 + }, + { + "epoch": 3.806950978904234, + "grad_norm": 1.158400297164917, + "learning_rate": 3.65596841146097e-05, + "loss": 0.7546, + "step": 12542 + }, + { + "epoch": 3.807254515100926, + "grad_norm": 1.3073914051055908, + "learning_rate": 3.6554621848739496e-05, + "loss": 1.0833, + "step": 12543 + }, + { + "epoch": 3.807558051297617, + "grad_norm": 1.3367284536361694, + "learning_rate": 3.654955958286929e-05, + "loss": 0.8433, + "step": 12544 + }, + { + "epoch": 3.8078615874943087, + "grad_norm": 0.9839133024215698, + "learning_rate": 3.6544497316999086e-05, + "loss": 0.8277, + "step": 12545 + }, + { + "epoch": 3.8081651236910004, + "grad_norm": 0.9218907356262207, + "learning_rate": 3.653943505112889e-05, + "loss": 1.0215, + "step": 12546 + }, + { + "epoch": 3.8084686598876916, + "grad_norm": 0.6923597455024719, + "learning_rate": 3.653437278525869e-05, + "loss": 0.6621, + "step": 12547 + }, + { + "epoch": 3.808772196084383, + "grad_norm": 1.0883673429489136, + "learning_rate": 3.6529310519388484e-05, + "loss": 1.1138, + "step": 12548 + }, + { + "epoch": 3.8090757322810744, + "grad_norm": 1.3069573640823364, + "learning_rate": 3.652424825351828e-05, + "loss": 0.9807, + "step": 12549 + }, + { + "epoch": 3.809379268477766, + "grad_norm": 1.0207620859146118, + "learning_rate": 3.6519185987648074e-05, + "loss": 1.1417, + "step": 12550 + }, + { + "epoch": 3.8096828046744573, + "grad_norm": 1.184693694114685, + "learning_rate": 3.651412372177787e-05, + "loss": 0.6101, + "step": 12551 + }, + { + "epoch": 3.809986340871149, + "grad_norm": 1.1946877241134644, + "learning_rate": 3.650906145590767e-05, + "loss": 1.2366, + "step": 12552 + }, + { + "epoch": 3.8102898770678406, + "grad_norm": 1.2500317096710205, + "learning_rate": 3.6503999190037465e-05, + "loss": 0.7129, + "step": 12553 + }, + { + "epoch": 3.810593413264532, + "grad_norm": 1.0332715511322021, + "learning_rate": 3.649893692416726e-05, + "loss": 1.0753, + "step": 12554 + }, + { + "epoch": 3.810896949461223, + "grad_norm": 1.1311501264572144, + "learning_rate": 3.6493874658297054e-05, + "loss": 1.1275, + "step": 12555 + }, + { + "epoch": 3.8112004856579147, + "grad_norm": 1.0839035511016846, + "learning_rate": 3.648881239242685e-05, + "loss": 1.0938, + "step": 12556 + }, + { + "epoch": 3.8115040218546063, + "grad_norm": 0.9822790622711182, + "learning_rate": 3.648375012655665e-05, + "loss": 0.7078, + "step": 12557 + }, + { + "epoch": 3.8118075580512976, + "grad_norm": 1.138361930847168, + "learning_rate": 3.6478687860686445e-05, + "loss": 1.0619, + "step": 12558 + }, + { + "epoch": 3.812111094247989, + "grad_norm": 0.8938875198364258, + "learning_rate": 3.647362559481624e-05, + "loss": 0.9597, + "step": 12559 + }, + { + "epoch": 3.8124146304446804, + "grad_norm": 0.9459108114242554, + "learning_rate": 3.6468563328946035e-05, + "loss": 1.1845, + "step": 12560 + }, + { + "epoch": 3.812718166641372, + "grad_norm": 0.786181628704071, + "learning_rate": 3.646350106307584e-05, + "loss": 1.4033, + "step": 12561 + }, + { + "epoch": 3.8130217028380633, + "grad_norm": 1.0209492444992065, + "learning_rate": 3.645843879720563e-05, + "loss": 0.8995, + "step": 12562 + }, + { + "epoch": 3.813325239034755, + "grad_norm": 1.0225709676742554, + "learning_rate": 3.6453376531335426e-05, + "loss": 0.7914, + "step": 12563 + }, + { + "epoch": 3.8136287752314466, + "grad_norm": 1.0575718879699707, + "learning_rate": 3.644831426546522e-05, + "loss": 1.1255, + "step": 12564 + }, + { + "epoch": 3.813932311428138, + "grad_norm": 0.9127454161643982, + "learning_rate": 3.6443251999595016e-05, + "loss": 0.6872, + "step": 12565 + }, + { + "epoch": 3.814235847624829, + "grad_norm": 1.0032671689987183, + "learning_rate": 3.643818973372482e-05, + "loss": 1.2576, + "step": 12566 + }, + { + "epoch": 3.8145393838215207, + "grad_norm": 1.03171706199646, + "learning_rate": 3.643312746785461e-05, + "loss": 0.886, + "step": 12567 + }, + { + "epoch": 3.8148429200182123, + "grad_norm": 0.9096865057945251, + "learning_rate": 3.6428065201984414e-05, + "loss": 0.3693, + "step": 12568 + }, + { + "epoch": 3.8151464562149036, + "grad_norm": 0.9535962343215942, + "learning_rate": 3.642300293611421e-05, + "loss": 0.9618, + "step": 12569 + }, + { + "epoch": 3.815449992411595, + "grad_norm": 0.9836147427558899, + "learning_rate": 3.6417940670244e-05, + "loss": 0.5016, + "step": 12570 + }, + { + "epoch": 3.8157535286082864, + "grad_norm": 0.9265391826629639, + "learning_rate": 3.64128784043738e-05, + "loss": 1.1488, + "step": 12571 + }, + { + "epoch": 3.816057064804978, + "grad_norm": 0.9965578317642212, + "learning_rate": 3.64078161385036e-05, + "loss": 0.8073, + "step": 12572 + }, + { + "epoch": 3.8163606010016693, + "grad_norm": 0.8847346901893616, + "learning_rate": 3.6402753872633395e-05, + "loss": 1.1673, + "step": 12573 + }, + { + "epoch": 3.816664137198361, + "grad_norm": 0.9536630511283875, + "learning_rate": 3.639769160676319e-05, + "loss": 0.8438, + "step": 12574 + }, + { + "epoch": 3.8169676733950526, + "grad_norm": 1.1465344429016113, + "learning_rate": 3.6392629340892984e-05, + "loss": 1.4059, + "step": 12575 + }, + { + "epoch": 3.817271209591744, + "grad_norm": 1.121108055114746, + "learning_rate": 3.6387567075022786e-05, + "loss": 0.7204, + "step": 12576 + }, + { + "epoch": 3.817574745788435, + "grad_norm": 0.9842544198036194, + "learning_rate": 3.638250480915258e-05, + "loss": 1.2694, + "step": 12577 + }, + { + "epoch": 3.8178782819851267, + "grad_norm": 1.0562217235565186, + "learning_rate": 3.6377442543282375e-05, + "loss": 0.6263, + "step": 12578 + }, + { + "epoch": 3.8181818181818183, + "grad_norm": 0.9709679484367371, + "learning_rate": 3.637238027741217e-05, + "loss": 0.6173, + "step": 12579 + }, + { + "epoch": 3.8184853543785096, + "grad_norm": 1.116947054862976, + "learning_rate": 3.6367318011541965e-05, + "loss": 1.0854, + "step": 12580 + }, + { + "epoch": 3.818788890575201, + "grad_norm": 1.1855547428131104, + "learning_rate": 3.6362255745671766e-05, + "loss": 1.0473, + "step": 12581 + }, + { + "epoch": 3.8190924267718924, + "grad_norm": 0.8418704867362976, + "learning_rate": 3.635719347980156e-05, + "loss": 1.0964, + "step": 12582 + }, + { + "epoch": 3.819395962968584, + "grad_norm": 1.182165503501892, + "learning_rate": 3.6352131213931356e-05, + "loss": 0.8107, + "step": 12583 + }, + { + "epoch": 3.8196994991652753, + "grad_norm": 1.1919047832489014, + "learning_rate": 3.634706894806115e-05, + "loss": 0.7338, + "step": 12584 + }, + { + "epoch": 3.820003035361967, + "grad_norm": 1.0663739442825317, + "learning_rate": 3.6342006682190946e-05, + "loss": 0.9889, + "step": 12585 + }, + { + "epoch": 3.8203065715586586, + "grad_norm": 1.1770719289779663, + "learning_rate": 3.633694441632075e-05, + "loss": 0.9773, + "step": 12586 + }, + { + "epoch": 3.82061010775535, + "grad_norm": 0.7286142706871033, + "learning_rate": 3.633188215045054e-05, + "loss": 1.3875, + "step": 12587 + }, + { + "epoch": 3.820913643952041, + "grad_norm": 1.1708937883377075, + "learning_rate": 3.632681988458034e-05, + "loss": 0.8601, + "step": 12588 + }, + { + "epoch": 3.8212171801487327, + "grad_norm": 0.8510751724243164, + "learning_rate": 3.632175761871013e-05, + "loss": 1.0468, + "step": 12589 + }, + { + "epoch": 3.8215207163454243, + "grad_norm": 1.0976594686508179, + "learning_rate": 3.631669535283993e-05, + "loss": 0.8098, + "step": 12590 + }, + { + "epoch": 3.8218242525421156, + "grad_norm": 0.8817259669303894, + "learning_rate": 3.6311633086969735e-05, + "loss": 1.1972, + "step": 12591 + }, + { + "epoch": 3.822127788738807, + "grad_norm": 1.0076229572296143, + "learning_rate": 3.630657082109953e-05, + "loss": 1.2062, + "step": 12592 + }, + { + "epoch": 3.8224313249354984, + "grad_norm": 0.9679299592971802, + "learning_rate": 3.6301508555229324e-05, + "loss": 1.0117, + "step": 12593 + }, + { + "epoch": 3.82273486113219, + "grad_norm": 0.8844080567359924, + "learning_rate": 3.629644628935912e-05, + "loss": 0.5204, + "step": 12594 + }, + { + "epoch": 3.8230383973288813, + "grad_norm": 0.9077200889587402, + "learning_rate": 3.6291384023488914e-05, + "loss": 0.4815, + "step": 12595 + }, + { + "epoch": 3.823341933525573, + "grad_norm": 1.237531065940857, + "learning_rate": 3.6286321757618715e-05, + "loss": 0.8014, + "step": 12596 + }, + { + "epoch": 3.8236454697222646, + "grad_norm": 1.063979148864746, + "learning_rate": 3.628125949174851e-05, + "loss": 1.0652, + "step": 12597 + }, + { + "epoch": 3.823949005918956, + "grad_norm": 0.5936614274978638, + "learning_rate": 3.6276197225878305e-05, + "loss": 1.1171, + "step": 12598 + }, + { + "epoch": 3.824252542115647, + "grad_norm": 0.9714362025260925, + "learning_rate": 3.62711349600081e-05, + "loss": 1.0921, + "step": 12599 + }, + { + "epoch": 3.8245560783123387, + "grad_norm": 1.1868579387664795, + "learning_rate": 3.62660726941379e-05, + "loss": 0.6683, + "step": 12600 + }, + { + "epoch": 3.8248596145090303, + "grad_norm": 1.22465181350708, + "learning_rate": 3.6261010428267696e-05, + "loss": 0.6697, + "step": 12601 + }, + { + "epoch": 3.8251631507057215, + "grad_norm": 1.0445573329925537, + "learning_rate": 3.625594816239749e-05, + "loss": 0.9781, + "step": 12602 + }, + { + "epoch": 3.825466686902413, + "grad_norm": 0.9368974566459656, + "learning_rate": 3.6250885896527286e-05, + "loss": 0.9731, + "step": 12603 + }, + { + "epoch": 3.8257702230991044, + "grad_norm": 0.925430953502655, + "learning_rate": 3.624582363065708e-05, + "loss": 0.99, + "step": 12604 + }, + { + "epoch": 3.826073759295796, + "grad_norm": 1.2370201349258423, + "learning_rate": 3.624076136478688e-05, + "loss": 0.9579, + "step": 12605 + }, + { + "epoch": 3.8263772954924873, + "grad_norm": 0.9670636653900146, + "learning_rate": 3.623569909891668e-05, + "loss": 0.8808, + "step": 12606 + }, + { + "epoch": 3.826680831689179, + "grad_norm": 1.3786380290985107, + "learning_rate": 3.623063683304647e-05, + "loss": 0.5776, + "step": 12607 + }, + { + "epoch": 3.8269843678858706, + "grad_norm": 1.0573362112045288, + "learning_rate": 3.6225574567176267e-05, + "loss": 1.1035, + "step": 12608 + }, + { + "epoch": 3.827287904082562, + "grad_norm": 0.9810343384742737, + "learning_rate": 3.622051230130606e-05, + "loss": 0.934, + "step": 12609 + }, + { + "epoch": 3.8275914402792535, + "grad_norm": 1.3955323696136475, + "learning_rate": 3.621545003543586e-05, + "loss": 0.6498, + "step": 12610 + }, + { + "epoch": 3.8278949764759447, + "grad_norm": 0.8948975205421448, + "learning_rate": 3.621038776956566e-05, + "loss": 1.0394, + "step": 12611 + }, + { + "epoch": 3.8281985126726363, + "grad_norm": 1.1089454889297485, + "learning_rate": 3.620532550369545e-05, + "loss": 1.0674, + "step": 12612 + }, + { + "epoch": 3.8285020488693275, + "grad_norm": 1.031445026397705, + "learning_rate": 3.6200263237825254e-05, + "loss": 0.4921, + "step": 12613 + }, + { + "epoch": 3.828805585066019, + "grad_norm": 0.9377485513687134, + "learning_rate": 3.619520097195505e-05, + "loss": 0.8707, + "step": 12614 + }, + { + "epoch": 3.8291091212627104, + "grad_norm": 0.6203951835632324, + "learning_rate": 3.619013870608485e-05, + "loss": 0.6399, + "step": 12615 + }, + { + "epoch": 3.829412657459402, + "grad_norm": 0.9445657730102539, + "learning_rate": 3.6185076440214645e-05, + "loss": 0.6648, + "step": 12616 + }, + { + "epoch": 3.8297161936560933, + "grad_norm": 0.7219453454017639, + "learning_rate": 3.618001417434444e-05, + "loss": 0.9266, + "step": 12617 + }, + { + "epoch": 3.830019729852785, + "grad_norm": 1.0757688283920288, + "learning_rate": 3.6174951908474235e-05, + "loss": 0.7094, + "step": 12618 + }, + { + "epoch": 3.8303232660494766, + "grad_norm": 1.2357137203216553, + "learning_rate": 3.616988964260403e-05, + "loss": 0.9795, + "step": 12619 + }, + { + "epoch": 3.830626802246168, + "grad_norm": 1.2534006834030151, + "learning_rate": 3.616482737673383e-05, + "loss": 0.8512, + "step": 12620 + }, + { + "epoch": 3.8309303384428595, + "grad_norm": 1.0862754583358765, + "learning_rate": 3.6159765110863626e-05, + "loss": 1.0443, + "step": 12621 + }, + { + "epoch": 3.8312338746395507, + "grad_norm": 0.8031094074249268, + "learning_rate": 3.615470284499342e-05, + "loss": 0.9296, + "step": 12622 + }, + { + "epoch": 3.8315374108362423, + "grad_norm": 0.8891496062278748, + "learning_rate": 3.6149640579123216e-05, + "loss": 0.9954, + "step": 12623 + }, + { + "epoch": 3.8318409470329335, + "grad_norm": 0.8501279354095459, + "learning_rate": 3.614457831325301e-05, + "loss": 0.4058, + "step": 12624 + }, + { + "epoch": 3.832144483229625, + "grad_norm": 1.4046821594238281, + "learning_rate": 3.613951604738281e-05, + "loss": 1.0708, + "step": 12625 + }, + { + "epoch": 3.832448019426317, + "grad_norm": 1.003602147102356, + "learning_rate": 3.613445378151261e-05, + "loss": 0.8414, + "step": 12626 + }, + { + "epoch": 3.832751555623008, + "grad_norm": 1.157523512840271, + "learning_rate": 3.61293915156424e-05, + "loss": 0.7318, + "step": 12627 + }, + { + "epoch": 3.8330550918196993, + "grad_norm": 1.124373435974121, + "learning_rate": 3.6124329249772196e-05, + "loss": 0.8934, + "step": 12628 + }, + { + "epoch": 3.833358628016391, + "grad_norm": 0.9450935125350952, + "learning_rate": 3.6119266983902e-05, + "loss": 1.3555, + "step": 12629 + }, + { + "epoch": 3.8336621642130826, + "grad_norm": 0.84016352891922, + "learning_rate": 3.611420471803179e-05, + "loss": 1.4493, + "step": 12630 + }, + { + "epoch": 3.833965700409774, + "grad_norm": 0.9923927783966064, + "learning_rate": 3.610914245216159e-05, + "loss": 0.7335, + "step": 12631 + }, + { + "epoch": 3.8342692366064655, + "grad_norm": 1.0308324098587036, + "learning_rate": 3.610408018629138e-05, + "loss": 1.1851, + "step": 12632 + }, + { + "epoch": 3.8345727728031567, + "grad_norm": 1.1900591850280762, + "learning_rate": 3.609901792042118e-05, + "loss": 0.8612, + "step": 12633 + }, + { + "epoch": 3.8348763089998483, + "grad_norm": 1.0952060222625732, + "learning_rate": 3.609395565455098e-05, + "loss": 0.9591, + "step": 12634 + }, + { + "epoch": 3.8351798451965395, + "grad_norm": 0.9611538052558899, + "learning_rate": 3.608889338868078e-05, + "loss": 0.8086, + "step": 12635 + }, + { + "epoch": 3.835483381393231, + "grad_norm": 1.2563023567199707, + "learning_rate": 3.6083831122810575e-05, + "loss": 0.8626, + "step": 12636 + }, + { + "epoch": 3.835786917589923, + "grad_norm": 1.0185880661010742, + "learning_rate": 3.607876885694037e-05, + "loss": 0.6785, + "step": 12637 + }, + { + "epoch": 3.836090453786614, + "grad_norm": 0.8604440689086914, + "learning_rate": 3.6073706591070165e-05, + "loss": 1.0588, + "step": 12638 + }, + { + "epoch": 3.8363939899833053, + "grad_norm": 1.2018462419509888, + "learning_rate": 3.6068644325199966e-05, + "loss": 0.6916, + "step": 12639 + }, + { + "epoch": 3.836697526179997, + "grad_norm": 1.1518785953521729, + "learning_rate": 3.606358205932976e-05, + "loss": 0.6106, + "step": 12640 + }, + { + "epoch": 3.8370010623766886, + "grad_norm": 1.1723309755325317, + "learning_rate": 3.6058519793459556e-05, + "loss": 0.9625, + "step": 12641 + }, + { + "epoch": 3.83730459857338, + "grad_norm": 1.0593045949935913, + "learning_rate": 3.605345752758935e-05, + "loss": 1.2079, + "step": 12642 + }, + { + "epoch": 3.8376081347700715, + "grad_norm": 1.2983299493789673, + "learning_rate": 3.6048395261719145e-05, + "loss": 0.923, + "step": 12643 + }, + { + "epoch": 3.8379116709667627, + "grad_norm": 1.133760690689087, + "learning_rate": 3.604333299584895e-05, + "loss": 1.0506, + "step": 12644 + }, + { + "epoch": 3.8382152071634543, + "grad_norm": 0.9849115610122681, + "learning_rate": 3.603827072997874e-05, + "loss": 1.1203, + "step": 12645 + }, + { + "epoch": 3.8385187433601455, + "grad_norm": 1.1651870012283325, + "learning_rate": 3.6033208464108536e-05, + "loss": 0.7526, + "step": 12646 + }, + { + "epoch": 3.838822279556837, + "grad_norm": 0.859154462814331, + "learning_rate": 3.602814619823833e-05, + "loss": 1.1796, + "step": 12647 + }, + { + "epoch": 3.839125815753529, + "grad_norm": 0.7726252675056458, + "learning_rate": 3.6023083932368126e-05, + "loss": 0.4717, + "step": 12648 + }, + { + "epoch": 3.83942935195022, + "grad_norm": 1.0019550323486328, + "learning_rate": 3.601802166649793e-05, + "loss": 0.6599, + "step": 12649 + }, + { + "epoch": 3.8397328881469113, + "grad_norm": 1.1222416162490845, + "learning_rate": 3.601295940062772e-05, + "loss": 1.1301, + "step": 12650 + }, + { + "epoch": 3.840036424343603, + "grad_norm": 0.8488395810127258, + "learning_rate": 3.600789713475752e-05, + "loss": 0.9742, + "step": 12651 + }, + { + "epoch": 3.8403399605402946, + "grad_norm": 0.9683934450149536, + "learning_rate": 3.600283486888731e-05, + "loss": 1.2505, + "step": 12652 + }, + { + "epoch": 3.840643496736986, + "grad_norm": 1.2314308881759644, + "learning_rate": 3.5997772603017114e-05, + "loss": 0.6376, + "step": 12653 + }, + { + "epoch": 3.8409470329336775, + "grad_norm": 0.8215912580490112, + "learning_rate": 3.599271033714691e-05, + "loss": 0.58, + "step": 12654 + }, + { + "epoch": 3.8412505691303687, + "grad_norm": 0.7688278555870056, + "learning_rate": 3.59876480712767e-05, + "loss": 1.4386, + "step": 12655 + }, + { + "epoch": 3.8415541053270603, + "grad_norm": 0.8174803256988525, + "learning_rate": 3.59825858054065e-05, + "loss": 0.5328, + "step": 12656 + }, + { + "epoch": 3.8418576415237515, + "grad_norm": 0.967585027217865, + "learning_rate": 3.59775235395363e-05, + "loss": 1.3622, + "step": 12657 + }, + { + "epoch": 3.842161177720443, + "grad_norm": 0.9755575060844421, + "learning_rate": 3.5972461273666094e-05, + "loss": 0.7753, + "step": 12658 + }, + { + "epoch": 3.842464713917135, + "grad_norm": 1.1214314699172974, + "learning_rate": 3.5967399007795896e-05, + "loss": 1.111, + "step": 12659 + }, + { + "epoch": 3.842768250113826, + "grad_norm": 1.0460931062698364, + "learning_rate": 3.596233674192569e-05, + "loss": 0.6251, + "step": 12660 + }, + { + "epoch": 3.8430717863105173, + "grad_norm": 1.2175486087799072, + "learning_rate": 3.5957274476055486e-05, + "loss": 0.7381, + "step": 12661 + }, + { + "epoch": 3.843375322507209, + "grad_norm": 0.8440338373184204, + "learning_rate": 3.595221221018528e-05, + "loss": 0.9959, + "step": 12662 + }, + { + "epoch": 3.8436788587039006, + "grad_norm": 1.1632927656173706, + "learning_rate": 3.5947149944315075e-05, + "loss": 0.634, + "step": 12663 + }, + { + "epoch": 3.843982394900592, + "grad_norm": 1.2207000255584717, + "learning_rate": 3.594208767844488e-05, + "loss": 0.807, + "step": 12664 + }, + { + "epoch": 3.8442859310972834, + "grad_norm": 1.2405529022216797, + "learning_rate": 3.593702541257467e-05, + "loss": 0.7856, + "step": 12665 + }, + { + "epoch": 3.8445894672939747, + "grad_norm": 1.1385959386825562, + "learning_rate": 3.5931963146704466e-05, + "loss": 0.4606, + "step": 12666 + }, + { + "epoch": 3.8448930034906663, + "grad_norm": 0.9806564450263977, + "learning_rate": 3.592690088083426e-05, + "loss": 1.3657, + "step": 12667 + }, + { + "epoch": 3.8451965396873575, + "grad_norm": 1.2369064092636108, + "learning_rate": 3.592183861496406e-05, + "loss": 0.4777, + "step": 12668 + }, + { + "epoch": 3.845500075884049, + "grad_norm": 1.0237668752670288, + "learning_rate": 3.591677634909386e-05, + "loss": 0.7254, + "step": 12669 + }, + { + "epoch": 3.845803612080741, + "grad_norm": 1.0868148803710938, + "learning_rate": 3.591171408322365e-05, + "loss": 0.8943, + "step": 12670 + }, + { + "epoch": 3.846107148277432, + "grad_norm": 0.8978376388549805, + "learning_rate": 3.590665181735345e-05, + "loss": 1.3045, + "step": 12671 + }, + { + "epoch": 3.8464106844741233, + "grad_norm": 1.1477622985839844, + "learning_rate": 3.590158955148324e-05, + "loss": 0.9715, + "step": 12672 + }, + { + "epoch": 3.846714220670815, + "grad_norm": 1.0696914196014404, + "learning_rate": 3.589652728561304e-05, + "loss": 1.1898, + "step": 12673 + }, + { + "epoch": 3.8470177568675066, + "grad_norm": 1.1978225708007812, + "learning_rate": 3.589146501974284e-05, + "loss": 0.9716, + "step": 12674 + }, + { + "epoch": 3.847321293064198, + "grad_norm": 1.003483772277832, + "learning_rate": 3.588640275387263e-05, + "loss": 0.6681, + "step": 12675 + }, + { + "epoch": 3.8476248292608894, + "grad_norm": 1.2099262475967407, + "learning_rate": 3.588134048800243e-05, + "loss": 0.9879, + "step": 12676 + }, + { + "epoch": 3.8479283654575807, + "grad_norm": 0.5804658532142639, + "learning_rate": 3.587627822213222e-05, + "loss": 1.1456, + "step": 12677 + }, + { + "epoch": 3.8482319016542723, + "grad_norm": 1.3019638061523438, + "learning_rate": 3.5871215956262024e-05, + "loss": 1.0442, + "step": 12678 + }, + { + "epoch": 3.8485354378509635, + "grad_norm": 1.1312780380249023, + "learning_rate": 3.5866153690391826e-05, + "loss": 0.9191, + "step": 12679 + }, + { + "epoch": 3.848838974047655, + "grad_norm": 1.1711311340332031, + "learning_rate": 3.586109142452162e-05, + "loss": 0.6664, + "step": 12680 + }, + { + "epoch": 3.849142510244347, + "grad_norm": 1.1850665807724, + "learning_rate": 3.5856029158651415e-05, + "loss": 0.9452, + "step": 12681 + }, + { + "epoch": 3.849446046441038, + "grad_norm": 1.2346030473709106, + "learning_rate": 3.585096689278121e-05, + "loss": 0.8589, + "step": 12682 + }, + { + "epoch": 3.8497495826377297, + "grad_norm": 0.9113971590995789, + "learning_rate": 3.584590462691101e-05, + "loss": 1.1441, + "step": 12683 + }, + { + "epoch": 3.850053118834421, + "grad_norm": 1.1443860530853271, + "learning_rate": 3.5840842361040806e-05, + "loss": 0.9508, + "step": 12684 + }, + { + "epoch": 3.8503566550311126, + "grad_norm": 0.9857345223426819, + "learning_rate": 3.58357800951706e-05, + "loss": 1.1763, + "step": 12685 + }, + { + "epoch": 3.850660191227804, + "grad_norm": 0.7271543741226196, + "learning_rate": 3.5830717829300396e-05, + "loss": 1.1079, + "step": 12686 + }, + { + "epoch": 3.8509637274244954, + "grad_norm": 1.0298407077789307, + "learning_rate": 3.582565556343019e-05, + "loss": 0.6967, + "step": 12687 + }, + { + "epoch": 3.851267263621187, + "grad_norm": 1.2919042110443115, + "learning_rate": 3.582059329755999e-05, + "loss": 0.2459, + "step": 12688 + }, + { + "epoch": 3.8515707998178783, + "grad_norm": 0.8432283997535706, + "learning_rate": 3.581553103168979e-05, + "loss": 0.792, + "step": 12689 + }, + { + "epoch": 3.8518743360145695, + "grad_norm": 1.0085020065307617, + "learning_rate": 3.581046876581958e-05, + "loss": 1.1587, + "step": 12690 + }, + { + "epoch": 3.852177872211261, + "grad_norm": 1.0004825592041016, + "learning_rate": 3.580540649994938e-05, + "loss": 0.756, + "step": 12691 + }, + { + "epoch": 3.852481408407953, + "grad_norm": 0.6788829565048218, + "learning_rate": 3.580034423407918e-05, + "loss": 0.8687, + "step": 12692 + }, + { + "epoch": 3.852784944604644, + "grad_norm": 1.2750455141067505, + "learning_rate": 3.579528196820897e-05, + "loss": 0.5877, + "step": 12693 + }, + { + "epoch": 3.8530884808013357, + "grad_norm": 1.0672144889831543, + "learning_rate": 3.579021970233877e-05, + "loss": 1.1586, + "step": 12694 + }, + { + "epoch": 3.853392016998027, + "grad_norm": 0.8018614053726196, + "learning_rate": 3.578515743646856e-05, + "loss": 1.2261, + "step": 12695 + }, + { + "epoch": 3.8536955531947186, + "grad_norm": 1.2437891960144043, + "learning_rate": 3.578009517059836e-05, + "loss": 0.5145, + "step": 12696 + }, + { + "epoch": 3.85399908939141, + "grad_norm": 1.1033931970596313, + "learning_rate": 3.577503290472816e-05, + "loss": 1.1441, + "step": 12697 + }, + { + "epoch": 3.8543026255881014, + "grad_norm": 0.9969262480735779, + "learning_rate": 3.5769970638857954e-05, + "loss": 0.7606, + "step": 12698 + }, + { + "epoch": 3.854606161784793, + "grad_norm": 0.9036029577255249, + "learning_rate": 3.576490837298775e-05, + "loss": 1.5952, + "step": 12699 + }, + { + "epoch": 3.8549096979814843, + "grad_norm": 0.7661654949188232, + "learning_rate": 3.5759846107117543e-05, + "loss": 0.6028, + "step": 12700 + }, + { + "epoch": 3.8552132341781755, + "grad_norm": 1.24527108669281, + "learning_rate": 3.575478384124734e-05, + "loss": 0.9086, + "step": 12701 + }, + { + "epoch": 3.855516770374867, + "grad_norm": 0.7893168926239014, + "learning_rate": 3.574972157537714e-05, + "loss": 0.3812, + "step": 12702 + }, + { + "epoch": 3.855820306571559, + "grad_norm": 1.3418463468551636, + "learning_rate": 3.574465930950694e-05, + "loss": 0.9262, + "step": 12703 + }, + { + "epoch": 3.85612384276825, + "grad_norm": 1.3264920711517334, + "learning_rate": 3.5739597043636736e-05, + "loss": 1.2089, + "step": 12704 + }, + { + "epoch": 3.8564273789649417, + "grad_norm": 0.8849372863769531, + "learning_rate": 3.573453477776653e-05, + "loss": 1.2696, + "step": 12705 + }, + { + "epoch": 3.856730915161633, + "grad_norm": 0.795248806476593, + "learning_rate": 3.5729472511896326e-05, + "loss": 1.123, + "step": 12706 + }, + { + "epoch": 3.8570344513583246, + "grad_norm": 0.906143069267273, + "learning_rate": 3.572441024602613e-05, + "loss": 1.4217, + "step": 12707 + }, + { + "epoch": 3.857337987555016, + "grad_norm": 0.8168256282806396, + "learning_rate": 3.571934798015592e-05, + "loss": 0.7287, + "step": 12708 + }, + { + "epoch": 3.8576415237517074, + "grad_norm": 1.0842193365097046, + "learning_rate": 3.571428571428572e-05, + "loss": 0.8874, + "step": 12709 + }, + { + "epoch": 3.857945059948399, + "grad_norm": 0.8586336374282837, + "learning_rate": 3.570922344841551e-05, + "loss": 1.4215, + "step": 12710 + }, + { + "epoch": 3.8582485961450903, + "grad_norm": 0.8054507374763489, + "learning_rate": 3.5704161182545307e-05, + "loss": 0.9708, + "step": 12711 + }, + { + "epoch": 3.8585521323417815, + "grad_norm": 1.0475492477416992, + "learning_rate": 3.569909891667511e-05, + "loss": 0.632, + "step": 12712 + }, + { + "epoch": 3.858855668538473, + "grad_norm": 1.082188367843628, + "learning_rate": 3.56940366508049e-05, + "loss": 1.3935, + "step": 12713 + }, + { + "epoch": 3.859159204735165, + "grad_norm": 1.1209795475006104, + "learning_rate": 3.56889743849347e-05, + "loss": 0.8696, + "step": 12714 + }, + { + "epoch": 3.859462740931856, + "grad_norm": 1.2744150161743164, + "learning_rate": 3.568391211906449e-05, + "loss": 0.946, + "step": 12715 + }, + { + "epoch": 3.8597662771285477, + "grad_norm": 0.9955036044120789, + "learning_rate": 3.567884985319429e-05, + "loss": 0.4893, + "step": 12716 + }, + { + "epoch": 3.860069813325239, + "grad_norm": 0.9003579616546631, + "learning_rate": 3.567378758732409e-05, + "loss": 0.8195, + "step": 12717 + }, + { + "epoch": 3.8603733495219306, + "grad_norm": 1.2418488264083862, + "learning_rate": 3.5668725321453884e-05, + "loss": 0.5796, + "step": 12718 + }, + { + "epoch": 3.8606768857186218, + "grad_norm": 1.1157699823379517, + "learning_rate": 3.566366305558368e-05, + "loss": 1.4453, + "step": 12719 + }, + { + "epoch": 3.8609804219153134, + "grad_norm": 0.9562606811523438, + "learning_rate": 3.565860078971347e-05, + "loss": 1.3648, + "step": 12720 + }, + { + "epoch": 3.861283958112005, + "grad_norm": 1.0574451684951782, + "learning_rate": 3.5653538523843275e-05, + "loss": 1.3278, + "step": 12721 + }, + { + "epoch": 3.8615874943086963, + "grad_norm": 0.9478166103363037, + "learning_rate": 3.564847625797307e-05, + "loss": 1.387, + "step": 12722 + }, + { + "epoch": 3.8618910305053875, + "grad_norm": 1.1048253774642944, + "learning_rate": 3.5643413992102864e-05, + "loss": 0.2592, + "step": 12723 + }, + { + "epoch": 3.862194566702079, + "grad_norm": 1.2735931873321533, + "learning_rate": 3.5638351726232666e-05, + "loss": 0.9707, + "step": 12724 + }, + { + "epoch": 3.862498102898771, + "grad_norm": 0.8279594779014587, + "learning_rate": 3.563328946036246e-05, + "loss": 0.8895, + "step": 12725 + }, + { + "epoch": 3.862801639095462, + "grad_norm": 1.2678651809692383, + "learning_rate": 3.5628227194492256e-05, + "loss": 0.7277, + "step": 12726 + }, + { + "epoch": 3.8631051752921537, + "grad_norm": 0.9743305444717407, + "learning_rate": 3.562316492862206e-05, + "loss": 0.8985, + "step": 12727 + }, + { + "epoch": 3.863408711488845, + "grad_norm": 0.8670724034309387, + "learning_rate": 3.561810266275185e-05, + "loss": 0.6863, + "step": 12728 + }, + { + "epoch": 3.8637122476855366, + "grad_norm": 0.7599443793296814, + "learning_rate": 3.561304039688165e-05, + "loss": 0.9507, + "step": 12729 + }, + { + "epoch": 3.8640157838822278, + "grad_norm": 1.0500792264938354, + "learning_rate": 3.560797813101144e-05, + "loss": 1.198, + "step": 12730 + }, + { + "epoch": 3.8643193200789194, + "grad_norm": 1.2817363739013672, + "learning_rate": 3.560291586514124e-05, + "loss": 0.9271, + "step": 12731 + }, + { + "epoch": 3.864622856275611, + "grad_norm": 1.009155035018921, + "learning_rate": 3.559785359927104e-05, + "loss": 0.6216, + "step": 12732 + }, + { + "epoch": 3.8649263924723023, + "grad_norm": 1.204560399055481, + "learning_rate": 3.559279133340083e-05, + "loss": 1.0542, + "step": 12733 + }, + { + "epoch": 3.8652299286689935, + "grad_norm": 1.0865867137908936, + "learning_rate": 3.558772906753063e-05, + "loss": 1.0922, + "step": 12734 + }, + { + "epoch": 3.865533464865685, + "grad_norm": 0.909726619720459, + "learning_rate": 3.558266680166042e-05, + "loss": 0.4874, + "step": 12735 + }, + { + "epoch": 3.865837001062377, + "grad_norm": 1.1957225799560547, + "learning_rate": 3.5577604535790224e-05, + "loss": 0.7084, + "step": 12736 + }, + { + "epoch": 3.866140537259068, + "grad_norm": 0.9338945150375366, + "learning_rate": 3.557254226992002e-05, + "loss": 1.3624, + "step": 12737 + }, + { + "epoch": 3.8664440734557597, + "grad_norm": 1.1019009351730347, + "learning_rate": 3.5567480004049813e-05, + "loss": 0.7742, + "step": 12738 + }, + { + "epoch": 3.866747609652451, + "grad_norm": 0.9993829727172852, + "learning_rate": 3.556241773817961e-05, + "loss": 0.8857, + "step": 12739 + }, + { + "epoch": 3.8670511458491426, + "grad_norm": 0.9779240489006042, + "learning_rate": 3.55573554723094e-05, + "loss": 0.4003, + "step": 12740 + }, + { + "epoch": 3.8673546820458338, + "grad_norm": 1.233641505241394, + "learning_rate": 3.5552293206439205e-05, + "loss": 0.7118, + "step": 12741 + }, + { + "epoch": 3.8676582182425254, + "grad_norm": 1.2404721975326538, + "learning_rate": 3.5547230940569e-05, + "loss": 0.8568, + "step": 12742 + }, + { + "epoch": 3.867961754439217, + "grad_norm": 0.840156614780426, + "learning_rate": 3.5542168674698794e-05, + "loss": 1.1145, + "step": 12743 + }, + { + "epoch": 3.8682652906359083, + "grad_norm": 1.2218763828277588, + "learning_rate": 3.553710640882859e-05, + "loss": 0.877, + "step": 12744 + }, + { + "epoch": 3.8685688268326, + "grad_norm": 1.2625459432601929, + "learning_rate": 3.553204414295839e-05, + "loss": 0.8955, + "step": 12745 + }, + { + "epoch": 3.868872363029291, + "grad_norm": 0.880547285079956, + "learning_rate": 3.552698187708819e-05, + "loss": 0.8973, + "step": 12746 + }, + { + "epoch": 3.869175899225983, + "grad_norm": 1.1880970001220703, + "learning_rate": 3.552191961121799e-05, + "loss": 0.9699, + "step": 12747 + }, + { + "epoch": 3.869479435422674, + "grad_norm": 1.107420802116394, + "learning_rate": 3.551685734534778e-05, + "loss": 1.2462, + "step": 12748 + }, + { + "epoch": 3.8697829716193657, + "grad_norm": 0.7863233089447021, + "learning_rate": 3.5511795079477577e-05, + "loss": 1.0645, + "step": 12749 + }, + { + "epoch": 3.8700865078160573, + "grad_norm": 1.011460542678833, + "learning_rate": 3.550673281360737e-05, + "loss": 0.9295, + "step": 12750 + }, + { + "epoch": 3.8703900440127486, + "grad_norm": 1.1474519968032837, + "learning_rate": 3.550167054773717e-05, + "loss": 0.9369, + "step": 12751 + }, + { + "epoch": 3.8706935802094398, + "grad_norm": 1.0394400358200073, + "learning_rate": 3.549660828186697e-05, + "loss": 0.8281, + "step": 12752 + }, + { + "epoch": 3.8709971164061314, + "grad_norm": 0.887823760509491, + "learning_rate": 3.549154601599676e-05, + "loss": 0.7503, + "step": 12753 + }, + { + "epoch": 3.871300652602823, + "grad_norm": 1.338474988937378, + "learning_rate": 3.548648375012656e-05, + "loss": 0.7491, + "step": 12754 + }, + { + "epoch": 3.8716041887995143, + "grad_norm": 1.1262630224227905, + "learning_rate": 3.548142148425635e-05, + "loss": 1.2889, + "step": 12755 + }, + { + "epoch": 3.871907724996206, + "grad_norm": 1.190826416015625, + "learning_rate": 3.5476359218386154e-05, + "loss": 0.9196, + "step": 12756 + }, + { + "epoch": 3.872211261192897, + "grad_norm": 1.133587121963501, + "learning_rate": 3.547129695251595e-05, + "loss": 1.28, + "step": 12757 + }, + { + "epoch": 3.872514797389589, + "grad_norm": 1.1030386686325073, + "learning_rate": 3.546623468664574e-05, + "loss": 0.9556, + "step": 12758 + }, + { + "epoch": 3.87281833358628, + "grad_norm": 1.0884510278701782, + "learning_rate": 3.546117242077554e-05, + "loss": 1.1316, + "step": 12759 + }, + { + "epoch": 3.8731218697829717, + "grad_norm": 1.196638822555542, + "learning_rate": 3.545611015490534e-05, + "loss": 0.838, + "step": 12760 + }, + { + "epoch": 3.8734254059796633, + "grad_norm": 0.9907419681549072, + "learning_rate": 3.5451047889035134e-05, + "loss": 0.7401, + "step": 12761 + }, + { + "epoch": 3.8737289421763546, + "grad_norm": 1.2768141031265259, + "learning_rate": 3.544598562316493e-05, + "loss": 1.0363, + "step": 12762 + }, + { + "epoch": 3.8740324783730458, + "grad_norm": 0.9903162121772766, + "learning_rate": 3.5440923357294724e-05, + "loss": 1.0413, + "step": 12763 + }, + { + "epoch": 3.8743360145697374, + "grad_norm": 1.1590310335159302, + "learning_rate": 3.543586109142452e-05, + "loss": 1.0529, + "step": 12764 + }, + { + "epoch": 3.874639550766429, + "grad_norm": 0.9943473935127258, + "learning_rate": 3.543079882555432e-05, + "loss": 0.8143, + "step": 12765 + }, + { + "epoch": 3.8749430869631203, + "grad_norm": 0.8532429933547974, + "learning_rate": 3.5425736559684115e-05, + "loss": 1.5673, + "step": 12766 + }, + { + "epoch": 3.875246623159812, + "grad_norm": 0.7676200866699219, + "learning_rate": 3.542067429381391e-05, + "loss": 0.955, + "step": 12767 + }, + { + "epoch": 3.875550159356503, + "grad_norm": 1.2963508367538452, + "learning_rate": 3.541561202794371e-05, + "loss": 0.6915, + "step": 12768 + }, + { + "epoch": 3.875853695553195, + "grad_norm": 1.0360219478607178, + "learning_rate": 3.5410549762073506e-05, + "loss": 0.5781, + "step": 12769 + }, + { + "epoch": 3.876157231749886, + "grad_norm": 1.1756477355957031, + "learning_rate": 3.540548749620331e-05, + "loss": 1.0179, + "step": 12770 + }, + { + "epoch": 3.8764607679465777, + "grad_norm": 0.5796340703964233, + "learning_rate": 3.54004252303331e-05, + "loss": 0.0941, + "step": 12771 + }, + { + "epoch": 3.8767643041432693, + "grad_norm": 1.1572502851486206, + "learning_rate": 3.53953629644629e-05, + "loss": 0.6157, + "step": 12772 + }, + { + "epoch": 3.8770678403399605, + "grad_norm": 0.9639040231704712, + "learning_rate": 3.539030069859269e-05, + "loss": 1.1969, + "step": 12773 + }, + { + "epoch": 3.8773713765366518, + "grad_norm": 1.224981427192688, + "learning_rate": 3.538523843272249e-05, + "loss": 0.7806, + "step": 12774 + }, + { + "epoch": 3.8776749127333434, + "grad_norm": 0.8508462905883789, + "learning_rate": 3.538017616685229e-05, + "loss": 0.4107, + "step": 12775 + }, + { + "epoch": 3.877978448930035, + "grad_norm": 1.174686312675476, + "learning_rate": 3.5375113900982083e-05, + "loss": 0.8329, + "step": 12776 + }, + { + "epoch": 3.8782819851267263, + "grad_norm": 1.1415894031524658, + "learning_rate": 3.537005163511188e-05, + "loss": 0.854, + "step": 12777 + }, + { + "epoch": 3.878585521323418, + "grad_norm": 1.0952720642089844, + "learning_rate": 3.536498936924167e-05, + "loss": 0.9908, + "step": 12778 + }, + { + "epoch": 3.878889057520109, + "grad_norm": 1.0311297178268433, + "learning_rate": 3.535992710337147e-05, + "loss": 0.5974, + "step": 12779 + }, + { + "epoch": 3.879192593716801, + "grad_norm": 1.2596989870071411, + "learning_rate": 3.535486483750127e-05, + "loss": 0.8244, + "step": 12780 + }, + { + "epoch": 3.879496129913492, + "grad_norm": 0.9016005992889404, + "learning_rate": 3.5349802571631064e-05, + "loss": 0.926, + "step": 12781 + }, + { + "epoch": 3.8797996661101837, + "grad_norm": 0.8995393514633179, + "learning_rate": 3.534474030576086e-05, + "loss": 1.2649, + "step": 12782 + }, + { + "epoch": 3.8801032023068753, + "grad_norm": 0.9061526656150818, + "learning_rate": 3.5339678039890654e-05, + "loss": 1.1058, + "step": 12783 + }, + { + "epoch": 3.8804067385035665, + "grad_norm": 1.2591369152069092, + "learning_rate": 3.5334615774020455e-05, + "loss": 1.027, + "step": 12784 + }, + { + "epoch": 3.8807102747002578, + "grad_norm": 1.287702202796936, + "learning_rate": 3.532955350815025e-05, + "loss": 0.8173, + "step": 12785 + }, + { + "epoch": 3.8810138108969494, + "grad_norm": 1.2254858016967773, + "learning_rate": 3.5324491242280045e-05, + "loss": 1.2852, + "step": 12786 + }, + { + "epoch": 3.881317347093641, + "grad_norm": 1.1175894737243652, + "learning_rate": 3.531942897640984e-05, + "loss": 0.8759, + "step": 12787 + }, + { + "epoch": 3.8816208832903323, + "grad_norm": 1.1211622953414917, + "learning_rate": 3.5314366710539634e-05, + "loss": 1.166, + "step": 12788 + }, + { + "epoch": 3.881924419487024, + "grad_norm": 0.9377552270889282, + "learning_rate": 3.5309304444669436e-05, + "loss": 0.8671, + "step": 12789 + }, + { + "epoch": 3.882227955683715, + "grad_norm": 1.4808958768844604, + "learning_rate": 3.530424217879923e-05, + "loss": 0.7477, + "step": 12790 + }, + { + "epoch": 3.882531491880407, + "grad_norm": 0.7783199548721313, + "learning_rate": 3.529917991292903e-05, + "loss": 1.3794, + "step": 12791 + }, + { + "epoch": 3.882835028077098, + "grad_norm": 0.888972282409668, + "learning_rate": 3.529411764705883e-05, + "loss": 0.8699, + "step": 12792 + }, + { + "epoch": 3.8831385642737897, + "grad_norm": 1.2149696350097656, + "learning_rate": 3.528905538118862e-05, + "loss": 0.7427, + "step": 12793 + }, + { + "epoch": 3.8834421004704813, + "grad_norm": 1.1536693572998047, + "learning_rate": 3.528399311531842e-05, + "loss": 0.961, + "step": 12794 + }, + { + "epoch": 3.8837456366671725, + "grad_norm": 1.130842685699463, + "learning_rate": 3.527893084944822e-05, + "loss": 1.17, + "step": 12795 + }, + { + "epoch": 3.8840491728638638, + "grad_norm": 1.11892831325531, + "learning_rate": 3.527386858357801e-05, + "loss": 1.1355, + "step": 12796 + }, + { + "epoch": 3.8843527090605554, + "grad_norm": 0.7281229496002197, + "learning_rate": 3.526880631770781e-05, + "loss": 0.9892, + "step": 12797 + }, + { + "epoch": 3.884656245257247, + "grad_norm": 1.0583444833755493, + "learning_rate": 3.52637440518376e-05, + "loss": 1.2847, + "step": 12798 + }, + { + "epoch": 3.8849597814539383, + "grad_norm": 1.1315759420394897, + "learning_rate": 3.5258681785967404e-05, + "loss": 1.055, + "step": 12799 + }, + { + "epoch": 3.88526331765063, + "grad_norm": 1.0264074802398682, + "learning_rate": 3.52536195200972e-05, + "loss": 1.0165, + "step": 12800 + }, + { + "epoch": 3.885566853847321, + "grad_norm": 0.978118896484375, + "learning_rate": 3.5248557254226994e-05, + "loss": 0.7709, + "step": 12801 + }, + { + "epoch": 3.885870390044013, + "grad_norm": 1.0030839443206787, + "learning_rate": 3.524349498835679e-05, + "loss": 0.7807, + "step": 12802 + }, + { + "epoch": 3.886173926240704, + "grad_norm": 1.0307530164718628, + "learning_rate": 3.5238432722486584e-05, + "loss": 0.5768, + "step": 12803 + }, + { + "epoch": 3.8864774624373957, + "grad_norm": 0.9714303016662598, + "learning_rate": 3.5233370456616385e-05, + "loss": 1.1965, + "step": 12804 + }, + { + "epoch": 3.8867809986340873, + "grad_norm": 1.2063406705856323, + "learning_rate": 3.522830819074618e-05, + "loss": 0.8643, + "step": 12805 + }, + { + "epoch": 3.8870845348307785, + "grad_norm": 1.2458761930465698, + "learning_rate": 3.5223245924875975e-05, + "loss": 0.8129, + "step": 12806 + }, + { + "epoch": 3.8873880710274697, + "grad_norm": 1.1465457677841187, + "learning_rate": 3.521818365900577e-05, + "loss": 0.8979, + "step": 12807 + }, + { + "epoch": 3.8876916072241614, + "grad_norm": 0.7686108350753784, + "learning_rate": 3.5213121393135564e-05, + "loss": 0.6285, + "step": 12808 + }, + { + "epoch": 3.887995143420853, + "grad_norm": 0.9141403436660767, + "learning_rate": 3.5208059127265366e-05, + "loss": 0.7176, + "step": 12809 + }, + { + "epoch": 3.8882986796175443, + "grad_norm": 1.1234393119812012, + "learning_rate": 3.520299686139516e-05, + "loss": 1.2368, + "step": 12810 + }, + { + "epoch": 3.888602215814236, + "grad_norm": 1.275098443031311, + "learning_rate": 3.5197934595524955e-05, + "loss": 0.8138, + "step": 12811 + }, + { + "epoch": 3.888905752010927, + "grad_norm": 1.1761176586151123, + "learning_rate": 3.519287232965475e-05, + "loss": 0.9623, + "step": 12812 + }, + { + "epoch": 3.889209288207619, + "grad_norm": 1.1443723440170288, + "learning_rate": 3.518781006378455e-05, + "loss": 1.0365, + "step": 12813 + }, + { + "epoch": 3.88951282440431, + "grad_norm": 1.0047680139541626, + "learning_rate": 3.518274779791435e-05, + "loss": 1.0186, + "step": 12814 + }, + { + "epoch": 3.8898163606010017, + "grad_norm": 1.1748378276824951, + "learning_rate": 3.517768553204415e-05, + "loss": 1.1157, + "step": 12815 + }, + { + "epoch": 3.8901198967976933, + "grad_norm": 0.8287310004234314, + "learning_rate": 3.517262326617394e-05, + "loss": 0.7002, + "step": 12816 + }, + { + "epoch": 3.8904234329943845, + "grad_norm": 0.942384660243988, + "learning_rate": 3.516756100030374e-05, + "loss": 0.8588, + "step": 12817 + }, + { + "epoch": 3.890726969191076, + "grad_norm": 1.122603178024292, + "learning_rate": 3.516249873443353e-05, + "loss": 1.0778, + "step": 12818 + }, + { + "epoch": 3.8910305053877674, + "grad_norm": 0.9391096234321594, + "learning_rate": 3.5157436468563334e-05, + "loss": 1.4015, + "step": 12819 + }, + { + "epoch": 3.891334041584459, + "grad_norm": 0.817673921585083, + "learning_rate": 3.515237420269313e-05, + "loss": 1.5834, + "step": 12820 + }, + { + "epoch": 3.8916375777811503, + "grad_norm": 0.9707675576210022, + "learning_rate": 3.5147311936822924e-05, + "loss": 0.8729, + "step": 12821 + }, + { + "epoch": 3.891941113977842, + "grad_norm": 0.863501787185669, + "learning_rate": 3.514224967095272e-05, + "loss": 1.37, + "step": 12822 + }, + { + "epoch": 3.8922446501745336, + "grad_norm": 1.0419350862503052, + "learning_rate": 3.513718740508252e-05, + "loss": 0.7965, + "step": 12823 + }, + { + "epoch": 3.892548186371225, + "grad_norm": 1.0991307497024536, + "learning_rate": 3.5132125139212315e-05, + "loss": 0.9646, + "step": 12824 + }, + { + "epoch": 3.892851722567916, + "grad_norm": 0.9612332582473755, + "learning_rate": 3.512706287334211e-05, + "loss": 1.0259, + "step": 12825 + }, + { + "epoch": 3.8931552587646077, + "grad_norm": 1.2292598485946655, + "learning_rate": 3.5122000607471904e-05, + "loss": 1.0057, + "step": 12826 + }, + { + "epoch": 3.8934587949612993, + "grad_norm": 1.1172291040420532, + "learning_rate": 3.51169383416017e-05, + "loss": 1.0687, + "step": 12827 + }, + { + "epoch": 3.8937623311579905, + "grad_norm": 1.3538439273834229, + "learning_rate": 3.51118760757315e-05, + "loss": 0.9505, + "step": 12828 + }, + { + "epoch": 3.894065867354682, + "grad_norm": 1.1800638437271118, + "learning_rate": 3.5106813809861296e-05, + "loss": 0.7264, + "step": 12829 + }, + { + "epoch": 3.8943694035513734, + "grad_norm": 0.9571306109428406, + "learning_rate": 3.510175154399109e-05, + "loss": 1.3112, + "step": 12830 + }, + { + "epoch": 3.894672939748065, + "grad_norm": 1.2847713232040405, + "learning_rate": 3.5096689278120885e-05, + "loss": 0.8676, + "step": 12831 + }, + { + "epoch": 3.8949764759447563, + "grad_norm": 0.7167972922325134, + "learning_rate": 3.509162701225068e-05, + "loss": 0.7598, + "step": 12832 + }, + { + "epoch": 3.895280012141448, + "grad_norm": 1.161267638206482, + "learning_rate": 3.508656474638048e-05, + "loss": 0.6827, + "step": 12833 + }, + { + "epoch": 3.8955835483381396, + "grad_norm": 0.9653947949409485, + "learning_rate": 3.5081502480510276e-05, + "loss": 1.2617, + "step": 12834 + }, + { + "epoch": 3.895887084534831, + "grad_norm": 1.134700059890747, + "learning_rate": 3.507644021464008e-05, + "loss": 0.7873, + "step": 12835 + }, + { + "epoch": 3.896190620731522, + "grad_norm": 0.8129681944847107, + "learning_rate": 3.507137794876987e-05, + "loss": 0.497, + "step": 12836 + }, + { + "epoch": 3.8964941569282137, + "grad_norm": 1.117619514465332, + "learning_rate": 3.506631568289967e-05, + "loss": 1.0713, + "step": 12837 + }, + { + "epoch": 3.8967976931249053, + "grad_norm": 0.827390730381012, + "learning_rate": 3.506125341702947e-05, + "loss": 1.2354, + "step": 12838 + }, + { + "epoch": 3.8971012293215965, + "grad_norm": 1.0602589845657349, + "learning_rate": 3.5056191151159264e-05, + "loss": 1.0091, + "step": 12839 + }, + { + "epoch": 3.897404765518288, + "grad_norm": 0.9485295414924622, + "learning_rate": 3.505112888528906e-05, + "loss": 1.3034, + "step": 12840 + }, + { + "epoch": 3.8977083017149794, + "grad_norm": 0.9212819933891296, + "learning_rate": 3.5046066619418853e-05, + "loss": 1.1694, + "step": 12841 + }, + { + "epoch": 3.898011837911671, + "grad_norm": 0.9449219107627869, + "learning_rate": 3.504100435354865e-05, + "loss": 1.3696, + "step": 12842 + }, + { + "epoch": 3.8983153741083623, + "grad_norm": 1.1611347198486328, + "learning_rate": 3.503594208767845e-05, + "loss": 0.9341, + "step": 12843 + }, + { + "epoch": 3.898618910305054, + "grad_norm": 0.8138907551765442, + "learning_rate": 3.5030879821808245e-05, + "loss": 1.4067, + "step": 12844 + }, + { + "epoch": 3.8989224465017456, + "grad_norm": 1.1224223375320435, + "learning_rate": 3.502581755593804e-05, + "loss": 0.3305, + "step": 12845 + }, + { + "epoch": 3.899225982698437, + "grad_norm": 1.1098846197128296, + "learning_rate": 3.5020755290067834e-05, + "loss": 0.8761, + "step": 12846 + }, + { + "epoch": 3.899529518895128, + "grad_norm": 0.8564134836196899, + "learning_rate": 3.501569302419763e-05, + "loss": 1.0748, + "step": 12847 + }, + { + "epoch": 3.8998330550918197, + "grad_norm": 1.205400824546814, + "learning_rate": 3.501063075832743e-05, + "loss": 1.0514, + "step": 12848 + }, + { + "epoch": 3.9001365912885113, + "grad_norm": 1.024410367012024, + "learning_rate": 3.5005568492457225e-05, + "loss": 0.6979, + "step": 12849 + }, + { + "epoch": 3.9004401274852025, + "grad_norm": 0.9631844162940979, + "learning_rate": 3.500050622658702e-05, + "loss": 0.7745, + "step": 12850 + }, + { + "epoch": 3.900743663681894, + "grad_norm": 0.823277473449707, + "learning_rate": 3.4995443960716815e-05, + "loss": 0.6338, + "step": 12851 + }, + { + "epoch": 3.9010471998785854, + "grad_norm": 1.1458966732025146, + "learning_rate": 3.4990381694846617e-05, + "loss": 0.8253, + "step": 12852 + }, + { + "epoch": 3.901350736075277, + "grad_norm": 1.0163133144378662, + "learning_rate": 3.498531942897641e-05, + "loss": 0.6213, + "step": 12853 + }, + { + "epoch": 3.9016542722719683, + "grad_norm": 1.088929533958435, + "learning_rate": 3.4980257163106206e-05, + "loss": 1.3314, + "step": 12854 + }, + { + "epoch": 3.90195780846866, + "grad_norm": 1.1183751821517944, + "learning_rate": 3.4975194897236e-05, + "loss": 1.4572, + "step": 12855 + }, + { + "epoch": 3.9022613446653516, + "grad_norm": 1.1112470626831055, + "learning_rate": 3.4970132631365796e-05, + "loss": 1.0209, + "step": 12856 + }, + { + "epoch": 3.902564880862043, + "grad_norm": 0.9992960691452026, + "learning_rate": 3.49650703654956e-05, + "loss": 0.4121, + "step": 12857 + }, + { + "epoch": 3.902868417058734, + "grad_norm": 0.8948344588279724, + "learning_rate": 3.49600080996254e-05, + "loss": 0.8608, + "step": 12858 + }, + { + "epoch": 3.9031719532554257, + "grad_norm": 1.014462947845459, + "learning_rate": 3.4954945833755194e-05, + "loss": 1.1499, + "step": 12859 + }, + { + "epoch": 3.9034754894521173, + "grad_norm": 0.947316586971283, + "learning_rate": 3.494988356788499e-05, + "loss": 0.9601, + "step": 12860 + }, + { + "epoch": 3.9037790256488085, + "grad_norm": 0.9740275144577026, + "learning_rate": 3.494482130201478e-05, + "loss": 1.2666, + "step": 12861 + }, + { + "epoch": 3.9040825618455, + "grad_norm": 0.7803905606269836, + "learning_rate": 3.4939759036144585e-05, + "loss": 1.0437, + "step": 12862 + }, + { + "epoch": 3.9043860980421914, + "grad_norm": 0.8562459945678711, + "learning_rate": 3.493469677027438e-05, + "loss": 0.9262, + "step": 12863 + }, + { + "epoch": 3.904689634238883, + "grad_norm": 1.093837022781372, + "learning_rate": 3.4929634504404174e-05, + "loss": 0.6028, + "step": 12864 + }, + { + "epoch": 3.9049931704355743, + "grad_norm": 1.1971759796142578, + "learning_rate": 3.492457223853397e-05, + "loss": 1.0176, + "step": 12865 + }, + { + "epoch": 3.905296706632266, + "grad_norm": 0.7600323557853699, + "learning_rate": 3.4919509972663764e-05, + "loss": 1.1024, + "step": 12866 + }, + { + "epoch": 3.9056002428289576, + "grad_norm": 0.6052135229110718, + "learning_rate": 3.4914447706793566e-05, + "loss": 1.4025, + "step": 12867 + }, + { + "epoch": 3.905903779025649, + "grad_norm": 0.8249796628952026, + "learning_rate": 3.490938544092336e-05, + "loss": 1.4009, + "step": 12868 + }, + { + "epoch": 3.90620731522234, + "grad_norm": 1.2537859678268433, + "learning_rate": 3.4904323175053155e-05, + "loss": 0.8675, + "step": 12869 + }, + { + "epoch": 3.9065108514190316, + "grad_norm": 0.7942625284194946, + "learning_rate": 3.489926090918295e-05, + "loss": 1.0506, + "step": 12870 + }, + { + "epoch": 3.9068143876157233, + "grad_norm": 0.8161466717720032, + "learning_rate": 3.4894198643312745e-05, + "loss": 0.8116, + "step": 12871 + }, + { + "epoch": 3.9071179238124145, + "grad_norm": 1.1263083219528198, + "learning_rate": 3.4889136377442546e-05, + "loss": 1.0761, + "step": 12872 + }, + { + "epoch": 3.907421460009106, + "grad_norm": 1.1364818811416626, + "learning_rate": 3.488407411157234e-05, + "loss": 0.5963, + "step": 12873 + }, + { + "epoch": 3.9077249962057974, + "grad_norm": 1.0310466289520264, + "learning_rate": 3.4879011845702136e-05, + "loss": 0.6519, + "step": 12874 + }, + { + "epoch": 3.908028532402489, + "grad_norm": 0.8450657725334167, + "learning_rate": 3.487394957983193e-05, + "loss": 0.6331, + "step": 12875 + }, + { + "epoch": 3.9083320685991803, + "grad_norm": 1.055415391921997, + "learning_rate": 3.486888731396173e-05, + "loss": 1.1029, + "step": 12876 + }, + { + "epoch": 3.908635604795872, + "grad_norm": 1.3890700340270996, + "learning_rate": 3.486382504809153e-05, + "loss": 0.8733, + "step": 12877 + }, + { + "epoch": 3.9089391409925636, + "grad_norm": 1.1576521396636963, + "learning_rate": 3.485876278222132e-05, + "loss": 0.9503, + "step": 12878 + }, + { + "epoch": 3.9092426771892548, + "grad_norm": 0.9269441366195679, + "learning_rate": 3.485370051635112e-05, + "loss": 0.4886, + "step": 12879 + }, + { + "epoch": 3.9095462133859464, + "grad_norm": 1.0863484144210815, + "learning_rate": 3.484863825048092e-05, + "loss": 0.6022, + "step": 12880 + }, + { + "epoch": 3.9098497495826376, + "grad_norm": 0.9086548686027527, + "learning_rate": 3.484357598461071e-05, + "loss": 0.9302, + "step": 12881 + }, + { + "epoch": 3.9101532857793293, + "grad_norm": 0.8593830466270447, + "learning_rate": 3.4838513718740515e-05, + "loss": 1.4326, + "step": 12882 + }, + { + "epoch": 3.9104568219760205, + "grad_norm": 1.2345762252807617, + "learning_rate": 3.483345145287031e-05, + "loss": 0.921, + "step": 12883 + }, + { + "epoch": 3.910760358172712, + "grad_norm": 1.079460620880127, + "learning_rate": 3.4828389187000104e-05, + "loss": 1.1523, + "step": 12884 + }, + { + "epoch": 3.911063894369404, + "grad_norm": 0.82794189453125, + "learning_rate": 3.48233269211299e-05, + "loss": 0.9889, + "step": 12885 + }, + { + "epoch": 3.911367430566095, + "grad_norm": 0.5988240838050842, + "learning_rate": 3.4818264655259694e-05, + "loss": 1.2475, + "step": 12886 + }, + { + "epoch": 3.9116709667627863, + "grad_norm": 1.1055989265441895, + "learning_rate": 3.4813202389389495e-05, + "loss": 0.568, + "step": 12887 + }, + { + "epoch": 3.911974502959478, + "grad_norm": 1.1823478937149048, + "learning_rate": 3.480814012351929e-05, + "loss": 0.9621, + "step": 12888 + }, + { + "epoch": 3.9122780391561696, + "grad_norm": 1.0848126411437988, + "learning_rate": 3.4803077857649085e-05, + "loss": 1.1488, + "step": 12889 + }, + { + "epoch": 3.9125815753528608, + "grad_norm": 0.7492715120315552, + "learning_rate": 3.479801559177888e-05, + "loss": 0.5606, + "step": 12890 + }, + { + "epoch": 3.9128851115495524, + "grad_norm": 0.9915061593055725, + "learning_rate": 3.479295332590868e-05, + "loss": 1.1509, + "step": 12891 + }, + { + "epoch": 3.9131886477462436, + "grad_norm": 1.320170521736145, + "learning_rate": 3.4787891060038476e-05, + "loss": 0.7609, + "step": 12892 + }, + { + "epoch": 3.9134921839429353, + "grad_norm": 0.9450749754905701, + "learning_rate": 3.478282879416827e-05, + "loss": 1.6667, + "step": 12893 + }, + { + "epoch": 3.9137957201396265, + "grad_norm": 1.105170488357544, + "learning_rate": 3.4777766528298066e-05, + "loss": 0.6717, + "step": 12894 + }, + { + "epoch": 3.914099256336318, + "grad_norm": 0.9514955282211304, + "learning_rate": 3.477270426242786e-05, + "loss": 1.0965, + "step": 12895 + }, + { + "epoch": 3.91440279253301, + "grad_norm": 0.9917048215866089, + "learning_rate": 3.476764199655766e-05, + "loss": 1.1753, + "step": 12896 + }, + { + "epoch": 3.914706328729701, + "grad_norm": 0.9203944802284241, + "learning_rate": 3.476257973068746e-05, + "loss": 1.3787, + "step": 12897 + }, + { + "epoch": 3.9150098649263922, + "grad_norm": 1.3331972360610962, + "learning_rate": 3.475751746481725e-05, + "loss": 0.8704, + "step": 12898 + }, + { + "epoch": 3.915313401123084, + "grad_norm": 1.0767219066619873, + "learning_rate": 3.4752455198947046e-05, + "loss": 0.5112, + "step": 12899 + }, + { + "epoch": 3.9156169373197756, + "grad_norm": 0.8247134685516357, + "learning_rate": 3.474739293307684e-05, + "loss": 0.5195, + "step": 12900 + }, + { + "epoch": 3.9159204735164668, + "grad_norm": 1.037001609802246, + "learning_rate": 3.474233066720664e-05, + "loss": 0.6044, + "step": 12901 + }, + { + "epoch": 3.9162240097131584, + "grad_norm": 0.8611426949501038, + "learning_rate": 3.4737268401336444e-05, + "loss": 0.9853, + "step": 12902 + }, + { + "epoch": 3.9165275459098496, + "grad_norm": 0.8114489316940308, + "learning_rate": 3.473220613546624e-05, + "loss": 1.1405, + "step": 12903 + }, + { + "epoch": 3.9168310821065413, + "grad_norm": 1.0356072187423706, + "learning_rate": 3.4727143869596034e-05, + "loss": 0.8928, + "step": 12904 + }, + { + "epoch": 3.9171346183032325, + "grad_norm": 0.9236775636672974, + "learning_rate": 3.472208160372583e-05, + "loss": 0.6081, + "step": 12905 + }, + { + "epoch": 3.917438154499924, + "grad_norm": 1.2136523723602295, + "learning_rate": 3.471701933785563e-05, + "loss": 0.9908, + "step": 12906 + }, + { + "epoch": 3.917741690696616, + "grad_norm": 1.0099775791168213, + "learning_rate": 3.4711957071985425e-05, + "loss": 0.6626, + "step": 12907 + }, + { + "epoch": 3.918045226893307, + "grad_norm": 0.8759127855300903, + "learning_rate": 3.470689480611522e-05, + "loss": 0.7138, + "step": 12908 + }, + { + "epoch": 3.9183487630899982, + "grad_norm": 1.0620791912078857, + "learning_rate": 3.4701832540245015e-05, + "loss": 1.1003, + "step": 12909 + }, + { + "epoch": 3.91865229928669, + "grad_norm": 0.9384909868240356, + "learning_rate": 3.469677027437481e-05, + "loss": 0.8244, + "step": 12910 + }, + { + "epoch": 3.9189558354833816, + "grad_norm": 1.116242527961731, + "learning_rate": 3.469170800850461e-05, + "loss": 0.8404, + "step": 12911 + }, + { + "epoch": 3.9192593716800728, + "grad_norm": 1.1098620891571045, + "learning_rate": 3.4686645742634406e-05, + "loss": 0.5527, + "step": 12912 + }, + { + "epoch": 3.9195629078767644, + "grad_norm": 1.0352057218551636, + "learning_rate": 3.46815834767642e-05, + "loss": 1.0489, + "step": 12913 + }, + { + "epoch": 3.9198664440734556, + "grad_norm": 0.9355011582374573, + "learning_rate": 3.4676521210893995e-05, + "loss": 1.1283, + "step": 12914 + }, + { + "epoch": 3.9201699802701473, + "grad_norm": 0.6213399767875671, + "learning_rate": 3.46714589450238e-05, + "loss": 0.5443, + "step": 12915 + }, + { + "epoch": 3.9204735164668385, + "grad_norm": 1.0223432779312134, + "learning_rate": 3.466639667915359e-05, + "loss": 1.1667, + "step": 12916 + }, + { + "epoch": 3.92077705266353, + "grad_norm": 0.5702821612358093, + "learning_rate": 3.4661334413283387e-05, + "loss": 0.8732, + "step": 12917 + }, + { + "epoch": 3.921080588860222, + "grad_norm": 1.086014747619629, + "learning_rate": 3.465627214741318e-05, + "loss": 1.2579, + "step": 12918 + }, + { + "epoch": 3.921384125056913, + "grad_norm": 0.9580252170562744, + "learning_rate": 3.4651209881542976e-05, + "loss": 0.9193, + "step": 12919 + }, + { + "epoch": 3.9216876612536042, + "grad_norm": 1.2352651357650757, + "learning_rate": 3.464614761567278e-05, + "loss": 0.9098, + "step": 12920 + }, + { + "epoch": 3.921991197450296, + "grad_norm": 0.8585063219070435, + "learning_rate": 3.464108534980257e-05, + "loss": 0.8995, + "step": 12921 + }, + { + "epoch": 3.9222947336469876, + "grad_norm": 0.7661798596382141, + "learning_rate": 3.463602308393237e-05, + "loss": 0.6158, + "step": 12922 + }, + { + "epoch": 3.9225982698436788, + "grad_norm": 1.2403442859649658, + "learning_rate": 3.463096081806216e-05, + "loss": 0.8254, + "step": 12923 + }, + { + "epoch": 3.9229018060403704, + "grad_norm": 1.1763293743133545, + "learning_rate": 3.4625898552191964e-05, + "loss": 0.8035, + "step": 12924 + }, + { + "epoch": 3.9232053422370616, + "grad_norm": 1.222867488861084, + "learning_rate": 3.462083628632176e-05, + "loss": 1.0514, + "step": 12925 + }, + { + "epoch": 3.9235088784337533, + "grad_norm": 1.016494631767273, + "learning_rate": 3.461577402045156e-05, + "loss": 0.9438, + "step": 12926 + }, + { + "epoch": 3.9238124146304445, + "grad_norm": 0.9142356514930725, + "learning_rate": 3.4610711754581355e-05, + "loss": 1.1086, + "step": 12927 + }, + { + "epoch": 3.924115950827136, + "grad_norm": 0.7526003122329712, + "learning_rate": 3.460564948871115e-05, + "loss": 0.5114, + "step": 12928 + }, + { + "epoch": 3.924419487023828, + "grad_norm": 1.0274808406829834, + "learning_rate": 3.4600587222840944e-05, + "loss": 1.1915, + "step": 12929 + }, + { + "epoch": 3.924723023220519, + "grad_norm": 1.0000874996185303, + "learning_rate": 3.4595524956970746e-05, + "loss": 0.8233, + "step": 12930 + }, + { + "epoch": 3.9250265594172102, + "grad_norm": 0.9876391887664795, + "learning_rate": 3.459046269110054e-05, + "loss": 0.9037, + "step": 12931 + }, + { + "epoch": 3.925330095613902, + "grad_norm": 0.8622162342071533, + "learning_rate": 3.4585400425230336e-05, + "loss": 0.7963, + "step": 12932 + }, + { + "epoch": 3.9256336318105935, + "grad_norm": 0.8594898581504822, + "learning_rate": 3.458033815936013e-05, + "loss": 0.549, + "step": 12933 + }, + { + "epoch": 3.9259371680072848, + "grad_norm": 1.1334692239761353, + "learning_rate": 3.4575275893489925e-05, + "loss": 0.8462, + "step": 12934 + }, + { + "epoch": 3.9262407042039764, + "grad_norm": 0.9069984555244446, + "learning_rate": 3.457021362761973e-05, + "loss": 1.152, + "step": 12935 + }, + { + "epoch": 3.9265442404006676, + "grad_norm": 1.1273424625396729, + "learning_rate": 3.456515136174952e-05, + "loss": 0.9965, + "step": 12936 + }, + { + "epoch": 3.9268477765973593, + "grad_norm": 1.0060861110687256, + "learning_rate": 3.4560089095879316e-05, + "loss": 1.0058, + "step": 12937 + }, + { + "epoch": 3.9271513127940505, + "grad_norm": 1.1516422033309937, + "learning_rate": 3.455502683000911e-05, + "loss": 0.7691, + "step": 12938 + }, + { + "epoch": 3.927454848990742, + "grad_norm": 0.9832441210746765, + "learning_rate": 3.4549964564138906e-05, + "loss": 0.666, + "step": 12939 + }, + { + "epoch": 3.927758385187434, + "grad_norm": 0.8818472623825073, + "learning_rate": 3.454490229826871e-05, + "loss": 1.2367, + "step": 12940 + }, + { + "epoch": 3.928061921384125, + "grad_norm": 1.1702052354812622, + "learning_rate": 3.45398400323985e-05, + "loss": 0.515, + "step": 12941 + }, + { + "epoch": 3.9283654575808167, + "grad_norm": 0.8900005221366882, + "learning_rate": 3.45347777665283e-05, + "loss": 0.7948, + "step": 12942 + }, + { + "epoch": 3.928668993777508, + "grad_norm": 1.1836581230163574, + "learning_rate": 3.452971550065809e-05, + "loss": 0.8335, + "step": 12943 + }, + { + "epoch": 3.9289725299741995, + "grad_norm": 0.9398072361946106, + "learning_rate": 3.4524653234787893e-05, + "loss": 0.417, + "step": 12944 + }, + { + "epoch": 3.9292760661708908, + "grad_norm": 1.1093302965164185, + "learning_rate": 3.451959096891769e-05, + "loss": 1.0922, + "step": 12945 + }, + { + "epoch": 3.9295796023675824, + "grad_norm": 1.0267990827560425, + "learning_rate": 3.451452870304749e-05, + "loss": 1.1161, + "step": 12946 + }, + { + "epoch": 3.9298831385642736, + "grad_norm": 1.2737693786621094, + "learning_rate": 3.4509466437177285e-05, + "loss": 0.7559, + "step": 12947 + }, + { + "epoch": 3.9301866747609653, + "grad_norm": 1.0777260065078735, + "learning_rate": 3.450440417130708e-05, + "loss": 1.2519, + "step": 12948 + }, + { + "epoch": 3.9304902109576565, + "grad_norm": 1.2004776000976562, + "learning_rate": 3.4499341905436874e-05, + "loss": 1.0826, + "step": 12949 + }, + { + "epoch": 3.930793747154348, + "grad_norm": 1.3835471868515015, + "learning_rate": 3.4494279639566676e-05, + "loss": 1.0255, + "step": 12950 + }, + { + "epoch": 3.93109728335104, + "grad_norm": 0.9945926070213318, + "learning_rate": 3.448921737369647e-05, + "loss": 0.3174, + "step": 12951 + }, + { + "epoch": 3.931400819547731, + "grad_norm": 1.0720852613449097, + "learning_rate": 3.4484155107826265e-05, + "loss": 0.9559, + "step": 12952 + }, + { + "epoch": 3.9317043557444227, + "grad_norm": 1.2544095516204834, + "learning_rate": 3.447909284195606e-05, + "loss": 0.7755, + "step": 12953 + }, + { + "epoch": 3.932007891941114, + "grad_norm": 1.2691837549209595, + "learning_rate": 3.447403057608586e-05, + "loss": 1.1166, + "step": 12954 + }, + { + "epoch": 3.9323114281378055, + "grad_norm": 1.0247416496276855, + "learning_rate": 3.4468968310215657e-05, + "loss": 0.6483, + "step": 12955 + }, + { + "epoch": 3.9326149643344968, + "grad_norm": 0.6031266450881958, + "learning_rate": 3.446390604434545e-05, + "loss": 1.5393, + "step": 12956 + }, + { + "epoch": 3.9329185005311884, + "grad_norm": 0.849032998085022, + "learning_rate": 3.4458843778475246e-05, + "loss": 0.8804, + "step": 12957 + }, + { + "epoch": 3.93322203672788, + "grad_norm": 0.9742588996887207, + "learning_rate": 3.445378151260504e-05, + "loss": 0.7288, + "step": 12958 + }, + { + "epoch": 3.9335255729245713, + "grad_norm": 0.9747579097747803, + "learning_rate": 3.444871924673484e-05, + "loss": 1.1304, + "step": 12959 + }, + { + "epoch": 3.9338291091212625, + "grad_norm": 1.229857325553894, + "learning_rate": 3.444365698086464e-05, + "loss": 1.1285, + "step": 12960 + }, + { + "epoch": 3.934132645317954, + "grad_norm": 1.0741032361984253, + "learning_rate": 3.443859471499443e-05, + "loss": 0.9812, + "step": 12961 + }, + { + "epoch": 3.934436181514646, + "grad_norm": 1.0816117525100708, + "learning_rate": 3.443353244912423e-05, + "loss": 0.9275, + "step": 12962 + }, + { + "epoch": 3.934739717711337, + "grad_norm": 1.2288004159927368, + "learning_rate": 3.442847018325402e-05, + "loss": 0.8838, + "step": 12963 + }, + { + "epoch": 3.9350432539080287, + "grad_norm": 1.1112147569656372, + "learning_rate": 3.442340791738382e-05, + "loss": 1.0218, + "step": 12964 + }, + { + "epoch": 3.93534679010472, + "grad_norm": 0.7350972294807434, + "learning_rate": 3.441834565151362e-05, + "loss": 1.5615, + "step": 12965 + }, + { + "epoch": 3.9356503263014115, + "grad_norm": 0.7265928387641907, + "learning_rate": 3.441328338564341e-05, + "loss": 1.3873, + "step": 12966 + }, + { + "epoch": 3.9359538624981028, + "grad_norm": 0.9279378056526184, + "learning_rate": 3.440822111977321e-05, + "loss": 1.1718, + "step": 12967 + }, + { + "epoch": 3.9362573986947944, + "grad_norm": 0.9645172953605652, + "learning_rate": 3.440315885390301e-05, + "loss": 1.4764, + "step": 12968 + }, + { + "epoch": 3.936560934891486, + "grad_norm": 1.0865131616592407, + "learning_rate": 3.439809658803281e-05, + "loss": 1.1067, + "step": 12969 + }, + { + "epoch": 3.9368644710881773, + "grad_norm": 1.1409804821014404, + "learning_rate": 3.4393034322162606e-05, + "loss": 0.7107, + "step": 12970 + }, + { + "epoch": 3.9371680072848685, + "grad_norm": 0.9799801707267761, + "learning_rate": 3.43879720562924e-05, + "loss": 0.7769, + "step": 12971 + }, + { + "epoch": 3.93747154348156, + "grad_norm": 0.949400007724762, + "learning_rate": 3.4382909790422195e-05, + "loss": 1.0955, + "step": 12972 + }, + { + "epoch": 3.937775079678252, + "grad_norm": 0.988425612449646, + "learning_rate": 3.437784752455199e-05, + "loss": 1.2014, + "step": 12973 + }, + { + "epoch": 3.938078615874943, + "grad_norm": 0.6010022759437561, + "learning_rate": 3.437278525868179e-05, + "loss": 0.8656, + "step": 12974 + }, + { + "epoch": 3.9383821520716347, + "grad_norm": 1.0673894882202148, + "learning_rate": 3.4367722992811586e-05, + "loss": 0.9675, + "step": 12975 + }, + { + "epoch": 3.938685688268326, + "grad_norm": 1.2702884674072266, + "learning_rate": 3.436266072694138e-05, + "loss": 0.632, + "step": 12976 + }, + { + "epoch": 3.9389892244650175, + "grad_norm": 1.3548012971878052, + "learning_rate": 3.4357598461071176e-05, + "loss": 0.6125, + "step": 12977 + }, + { + "epoch": 3.9392927606617087, + "grad_norm": 1.178971529006958, + "learning_rate": 3.435253619520097e-05, + "loss": 0.6941, + "step": 12978 + }, + { + "epoch": 3.9395962968584004, + "grad_norm": 0.9840765595436096, + "learning_rate": 3.434747392933077e-05, + "loss": 1.0074, + "step": 12979 + }, + { + "epoch": 3.939899833055092, + "grad_norm": 0.8956827521324158, + "learning_rate": 3.434241166346057e-05, + "loss": 0.9996, + "step": 12980 + }, + { + "epoch": 3.9402033692517833, + "grad_norm": 1.0694526433944702, + "learning_rate": 3.433734939759036e-05, + "loss": 1.0292, + "step": 12981 + }, + { + "epoch": 3.9405069054484745, + "grad_norm": 0.9277745485305786, + "learning_rate": 3.433228713172016e-05, + "loss": 1.1204, + "step": 12982 + }, + { + "epoch": 3.940810441645166, + "grad_norm": 0.7555796504020691, + "learning_rate": 3.432722486584996e-05, + "loss": 1.1315, + "step": 12983 + }, + { + "epoch": 3.941113977841858, + "grad_norm": 1.0947555303573608, + "learning_rate": 3.432216259997975e-05, + "loss": 0.9816, + "step": 12984 + }, + { + "epoch": 3.941417514038549, + "grad_norm": 1.2224199771881104, + "learning_rate": 3.431710033410955e-05, + "loss": 0.7874, + "step": 12985 + }, + { + "epoch": 3.9417210502352407, + "grad_norm": 1.3432127237319946, + "learning_rate": 3.431203806823934e-05, + "loss": 0.7881, + "step": 12986 + }, + { + "epoch": 3.942024586431932, + "grad_norm": 1.0046565532684326, + "learning_rate": 3.430697580236914e-05, + "loss": 0.436, + "step": 12987 + }, + { + "epoch": 3.9423281226286235, + "grad_norm": 0.9844894409179688, + "learning_rate": 3.430191353649894e-05, + "loss": 1.0126, + "step": 12988 + }, + { + "epoch": 3.9426316588253147, + "grad_norm": 1.016448736190796, + "learning_rate": 3.4296851270628734e-05, + "loss": 1.2571, + "step": 12989 + }, + { + "epoch": 3.9429351950220064, + "grad_norm": 1.2621372938156128, + "learning_rate": 3.429178900475853e-05, + "loss": 0.9672, + "step": 12990 + }, + { + "epoch": 3.943238731218698, + "grad_norm": 1.060259461402893, + "learning_rate": 3.428672673888833e-05, + "loss": 0.9932, + "step": 12991 + }, + { + "epoch": 3.9435422674153893, + "grad_norm": 1.1708661317825317, + "learning_rate": 3.4281664473018125e-05, + "loss": 0.8806, + "step": 12992 + }, + { + "epoch": 3.9438458036120805, + "grad_norm": 1.1283173561096191, + "learning_rate": 3.4276602207147927e-05, + "loss": 0.944, + "step": 12993 + }, + { + "epoch": 3.944149339808772, + "grad_norm": 1.1456162929534912, + "learning_rate": 3.427153994127772e-05, + "loss": 1.1183, + "step": 12994 + }, + { + "epoch": 3.944452876005464, + "grad_norm": 1.172694206237793, + "learning_rate": 3.4266477675407516e-05, + "loss": 0.8303, + "step": 12995 + }, + { + "epoch": 3.944756412202155, + "grad_norm": 0.9476397037506104, + "learning_rate": 3.426141540953731e-05, + "loss": 0.7942, + "step": 12996 + }, + { + "epoch": 3.9450599483988467, + "grad_norm": 0.9647150635719299, + "learning_rate": 3.4256353143667106e-05, + "loss": 0.8919, + "step": 12997 + }, + { + "epoch": 3.945363484595538, + "grad_norm": 0.9750615358352661, + "learning_rate": 3.425129087779691e-05, + "loss": 0.9493, + "step": 12998 + }, + { + "epoch": 3.9456670207922295, + "grad_norm": 0.7674238085746765, + "learning_rate": 3.42462286119267e-05, + "loss": 0.7358, + "step": 12999 + }, + { + "epoch": 3.9459705569889207, + "grad_norm": 0.9993025064468384, + "learning_rate": 3.42411663460565e-05, + "loss": 0.9647, + "step": 13000 + } + ], + "logging_steps": 1, + "max_steps": 19764, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.42420147779036e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}