diff --git "a/checkpoint-2729/trainer_state.json" "b/checkpoint-2729/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2729/trainer_state.json" @@ -0,0 +1,19168 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 683, + "global_step": 2729, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00036643459142543056, + "grad_norm": 0.46817973256111145, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.2732, + "step": 1 + }, + { + "epoch": 0.00036643459142543056, + "eval_loss": 1.0894134044647217, + "eval_runtime": 793.7236, + "eval_samples_per_second": 3.417, + "eval_steps_per_second": 0.427, + "step": 1 + }, + { + "epoch": 0.0007328691828508611, + "grad_norm": 0.47947484254837036, + "learning_rate": 4.000000000000001e-06, + "loss": 1.3138, + "step": 2 + }, + { + "epoch": 0.0010993037742762918, + "grad_norm": 0.49031010270118713, + "learning_rate": 6e-06, + "loss": 1.3379, + "step": 3 + }, + { + "epoch": 0.0014657383657017222, + "grad_norm": 0.48090797662734985, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2044, + "step": 4 + }, + { + "epoch": 0.001832172957127153, + "grad_norm": 0.4987533390522003, + "learning_rate": 1e-05, + "loss": 1.3141, + "step": 5 + }, + { + "epoch": 0.0021986075485525836, + "grad_norm": 0.500445544719696, + "learning_rate": 1.2e-05, + "loss": 1.3083, + "step": 6 + }, + { + "epoch": 0.002565042139978014, + "grad_norm": 0.44339627027511597, + "learning_rate": 1.4e-05, + "loss": 1.2854, + "step": 7 + }, + { + "epoch": 0.0029314767314034445, + "grad_norm": 0.48581796884536743, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.1958, + "step": 8 + }, + { + "epoch": 0.003297911322828875, + "grad_norm": 0.4820937514305115, + "learning_rate": 1.8e-05, + "loss": 1.3483, + "step": 9 + }, + { + "epoch": 0.003664345914254306, + "grad_norm": 0.44360584020614624, + "learning_rate": 2e-05, + "loss": 1.3793, + "step": 10 + }, + { + "epoch": 0.004030780505679736, + "grad_norm": 0.4145188331604004, + "learning_rate": 1.999999332499975e-05, + "loss": 0.8327, + "step": 11 + }, + { + "epoch": 0.004397215097105167, + "grad_norm": 0.3894279897212982, + "learning_rate": 1.9999973300007917e-05, + "loss": 1.2966, + "step": 12 + }, + { + "epoch": 0.004763649688530597, + "grad_norm": 0.38675037026405334, + "learning_rate": 1.999993992505123e-05, + "loss": 1.2477, + "step": 13 + }, + { + "epoch": 0.005130084279956028, + "grad_norm": 0.37546542286872864, + "learning_rate": 1.999989320017424e-05, + "loss": 1.2395, + "step": 14 + }, + { + "epoch": 0.005496518871381458, + "grad_norm": 0.3815326988697052, + "learning_rate": 1.9999833125439333e-05, + "loss": 1.2152, + "step": 15 + }, + { + "epoch": 0.005862953462806889, + "grad_norm": 0.4222187399864197, + "learning_rate": 1.9999759700926705e-05, + "loss": 1.3211, + "step": 16 + }, + { + "epoch": 0.00622938805423232, + "grad_norm": 0.3901718556880951, + "learning_rate": 1.9999672926734378e-05, + "loss": 1.3281, + "step": 17 + }, + { + "epoch": 0.00659582264565775, + "grad_norm": 0.4249936044216156, + "learning_rate": 1.99995728029782e-05, + "loss": 1.1635, + "step": 18 + }, + { + "epoch": 0.006962257237083181, + "grad_norm": 0.3950119912624359, + "learning_rate": 1.999945932979183e-05, + "loss": 1.2472, + "step": 19 + }, + { + "epoch": 0.007328691828508612, + "grad_norm": 0.47602978348731995, + "learning_rate": 1.9999332507326753e-05, + "loss": 1.2014, + "step": 20 + }, + { + "epoch": 0.007695126419934042, + "grad_norm": 0.43517524003982544, + "learning_rate": 1.9999192335752284e-05, + "loss": 1.2921, + "step": 21 + }, + { + "epoch": 0.008061561011359472, + "grad_norm": 0.41316503286361694, + "learning_rate": 1.999903881525555e-05, + "loss": 1.2258, + "step": 22 + }, + { + "epoch": 0.008427995602784902, + "grad_norm": 0.4038918912410736, + "learning_rate": 1.9998871946041497e-05, + "loss": 1.2234, + "step": 23 + }, + { + "epoch": 0.008794430194210334, + "grad_norm": 0.43499597907066345, + "learning_rate": 1.99986917283329e-05, + "loss": 1.2151, + "step": 24 + }, + { + "epoch": 0.009160864785635764, + "grad_norm": 0.4008733928203583, + "learning_rate": 1.9998498162370347e-05, + "loss": 1.1845, + "step": 25 + }, + { + "epoch": 0.009527299377061194, + "grad_norm": 0.42672181129455566, + "learning_rate": 1.9998291248412253e-05, + "loss": 1.111, + "step": 26 + }, + { + "epoch": 0.009893733968486624, + "grad_norm": 0.39955198764801025, + "learning_rate": 1.999807098673484e-05, + "loss": 1.2242, + "step": 27 + }, + { + "epoch": 0.010260168559912056, + "grad_norm": 0.4173682928085327, + "learning_rate": 1.999783737763216e-05, + "loss": 1.1515, + "step": 28 + }, + { + "epoch": 0.010626603151337486, + "grad_norm": 0.4032392203807831, + "learning_rate": 1.999759042141609e-05, + "loss": 1.2194, + "step": 29 + }, + { + "epoch": 0.010993037742762916, + "grad_norm": 0.391091525554657, + "learning_rate": 1.9997330118416305e-05, + "loss": 1.204, + "step": 30 + }, + { + "epoch": 0.011359472334188348, + "grad_norm": 0.3903694152832031, + "learning_rate": 1.9997056468980315e-05, + "loss": 1.2606, + "step": 31 + }, + { + "epoch": 0.011725906925613778, + "grad_norm": 0.3887855112552643, + "learning_rate": 1.999676947347344e-05, + "loss": 1.2385, + "step": 32 + }, + { + "epoch": 0.012092341517039208, + "grad_norm": 0.4681757390499115, + "learning_rate": 1.999646913227882e-05, + "loss": 1.1089, + "step": 33 + }, + { + "epoch": 0.01245877610846464, + "grad_norm": 0.42056483030319214, + "learning_rate": 1.9996155445797415e-05, + "loss": 1.1911, + "step": 34 + }, + { + "epoch": 0.01282521069989007, + "grad_norm": 0.4387955665588379, + "learning_rate": 1.999582841444799e-05, + "loss": 1.2162, + "step": 35 + }, + { + "epoch": 0.0131916452913155, + "grad_norm": 0.4079417884349823, + "learning_rate": 1.999548803866713e-05, + "loss": 1.1769, + "step": 36 + }, + { + "epoch": 0.013558079882740931, + "grad_norm": 0.48628127574920654, + "learning_rate": 1.9995134318909247e-05, + "loss": 1.0853, + "step": 37 + }, + { + "epoch": 0.013924514474166361, + "grad_norm": 0.5449658632278442, + "learning_rate": 1.9994767255646548e-05, + "loss": 1.0893, + "step": 38 + }, + { + "epoch": 0.014290949065591791, + "grad_norm": 0.4337720274925232, + "learning_rate": 1.9994386849369062e-05, + "loss": 1.2324, + "step": 39 + }, + { + "epoch": 0.014657383657017223, + "grad_norm": 0.4274596571922302, + "learning_rate": 1.999399310058464e-05, + "loss": 1.0204, + "step": 40 + }, + { + "epoch": 0.015023818248442653, + "grad_norm": 0.420960009098053, + "learning_rate": 1.9993586009818927e-05, + "loss": 1.1547, + "step": 41 + }, + { + "epoch": 0.015390252839868083, + "grad_norm": 0.42221662402153015, + "learning_rate": 1.9993165577615393e-05, + "loss": 1.1781, + "step": 42 + }, + { + "epoch": 0.015756687431293513, + "grad_norm": 0.4561188220977783, + "learning_rate": 1.999273180453532e-05, + "loss": 1.2284, + "step": 43 + }, + { + "epoch": 0.016123122022718945, + "grad_norm": 0.43053948879241943, + "learning_rate": 1.9992284691157785e-05, + "loss": 1.1378, + "step": 44 + }, + { + "epoch": 0.016489556614144377, + "grad_norm": 0.480121374130249, + "learning_rate": 1.999182423807969e-05, + "loss": 1.2033, + "step": 45 + }, + { + "epoch": 0.016855991205569805, + "grad_norm": 0.4473513960838318, + "learning_rate": 1.9991350445915745e-05, + "loss": 1.1538, + "step": 46 + }, + { + "epoch": 0.017222425796995237, + "grad_norm": 0.5381205081939697, + "learning_rate": 1.999086331529845e-05, + "loss": 1.1332, + "step": 47 + }, + { + "epoch": 0.01758886038842067, + "grad_norm": 0.4490923583507538, + "learning_rate": 1.999036284687814e-05, + "loss": 1.1733, + "step": 48 + }, + { + "epoch": 0.017955294979846097, + "grad_norm": 0.44338756799697876, + "learning_rate": 1.998984904132293e-05, + "loss": 1.1781, + "step": 49 + }, + { + "epoch": 0.01832172957127153, + "grad_norm": 0.5198048949241638, + "learning_rate": 1.998932189931875e-05, + "loss": 1.1102, + "step": 50 + }, + { + "epoch": 0.01868816416269696, + "grad_norm": 0.6394063234329224, + "learning_rate": 1.9988781421569333e-05, + "loss": 1.0399, + "step": 51 + }, + { + "epoch": 0.01905459875412239, + "grad_norm": 0.5123482942581177, + "learning_rate": 1.9988227608796227e-05, + "loss": 1.145, + "step": 52 + }, + { + "epoch": 0.01942103334554782, + "grad_norm": 0.5353620052337646, + "learning_rate": 1.9987660461738763e-05, + "loss": 1.0634, + "step": 53 + }, + { + "epoch": 0.01978746793697325, + "grad_norm": 0.49093300104141235, + "learning_rate": 1.998707998115409e-05, + "loss": 1.1711, + "step": 54 + }, + { + "epoch": 0.02015390252839868, + "grad_norm": 0.4911767542362213, + "learning_rate": 1.9986486167817136e-05, + "loss": 1.1343, + "step": 55 + }, + { + "epoch": 0.020520337119824112, + "grad_norm": 0.5678600072860718, + "learning_rate": 1.998587902252066e-05, + "loss": 1.1299, + "step": 56 + }, + { + "epoch": 0.02088677171124954, + "grad_norm": 0.5117340683937073, + "learning_rate": 1.9985258546075187e-05, + "loss": 1.1147, + "step": 57 + }, + { + "epoch": 0.021253206302674972, + "grad_norm": 0.5707929730415344, + "learning_rate": 1.9984624739309056e-05, + "loss": 1.1359, + "step": 58 + }, + { + "epoch": 0.021619640894100404, + "grad_norm": 0.539131224155426, + "learning_rate": 1.9983977603068404e-05, + "loss": 1.1615, + "step": 59 + }, + { + "epoch": 0.021986075485525832, + "grad_norm": 0.5231873393058777, + "learning_rate": 1.9983317138217155e-05, + "loss": 1.1158, + "step": 60 + }, + { + "epoch": 0.022352510076951264, + "grad_norm": 0.5066128373146057, + "learning_rate": 1.9982643345637027e-05, + "loss": 1.1435, + "step": 61 + }, + { + "epoch": 0.022718944668376696, + "grad_norm": 0.6198028922080994, + "learning_rate": 1.9981956226227537e-05, + "loss": 1.056, + "step": 62 + }, + { + "epoch": 0.023085379259802124, + "grad_norm": 0.5626301765441895, + "learning_rate": 1.9981255780905988e-05, + "loss": 1.1502, + "step": 63 + }, + { + "epoch": 0.023451813851227556, + "grad_norm": 0.6188763380050659, + "learning_rate": 1.9980542010607476e-05, + "loss": 1.0754, + "step": 64 + }, + { + "epoch": 0.023818248442652987, + "grad_norm": 0.6580602526664734, + "learning_rate": 1.997981491628488e-05, + "loss": 1.1088, + "step": 65 + }, + { + "epoch": 0.024184683034078416, + "grad_norm": 0.7106578350067139, + "learning_rate": 1.9979074498908876e-05, + "loss": 1.0783, + "step": 66 + }, + { + "epoch": 0.024551117625503847, + "grad_norm": 0.5700773596763611, + "learning_rate": 1.997832075946792e-05, + "loss": 1.0747, + "step": 67 + }, + { + "epoch": 0.02491755221692928, + "grad_norm": 0.6465053558349609, + "learning_rate": 1.9977553698968252e-05, + "loss": 1.1031, + "step": 68 + }, + { + "epoch": 0.025283986808354707, + "grad_norm": 0.5877029299736023, + "learning_rate": 1.9976773318433896e-05, + "loss": 1.1117, + "step": 69 + }, + { + "epoch": 0.02565042139978014, + "grad_norm": 0.6212673187255859, + "learning_rate": 1.9975979618906665e-05, + "loss": 1.0253, + "step": 70 + }, + { + "epoch": 0.02601685599120557, + "grad_norm": 0.6170660853385925, + "learning_rate": 1.9975172601446147e-05, + "loss": 0.9862, + "step": 71 + }, + { + "epoch": 0.026383290582631, + "grad_norm": 0.6177666783332825, + "learning_rate": 1.997435226712971e-05, + "loss": 1.0611, + "step": 72 + }, + { + "epoch": 0.02674972517405643, + "grad_norm": 0.6978894472122192, + "learning_rate": 1.99735186170525e-05, + "loss": 1.0911, + "step": 73 + }, + { + "epoch": 0.027116159765481863, + "grad_norm": 0.635191023349762, + "learning_rate": 1.997267165232744e-05, + "loss": 1.1159, + "step": 74 + }, + { + "epoch": 0.02748259435690729, + "grad_norm": 0.567520260810852, + "learning_rate": 1.9971811374085228e-05, + "loss": 1.113, + "step": 75 + }, + { + "epoch": 0.027849028948332723, + "grad_norm": 0.5939634442329407, + "learning_rate": 1.9970937783474333e-05, + "loss": 1.1586, + "step": 76 + }, + { + "epoch": 0.028215463539758154, + "grad_norm": 0.6401302218437195, + "learning_rate": 1.9970050881661e-05, + "loss": 1.0674, + "step": 77 + }, + { + "epoch": 0.028581898131183583, + "grad_norm": 0.6552327871322632, + "learning_rate": 1.9969150669829242e-05, + "loss": 1.088, + "step": 78 + }, + { + "epoch": 0.028948332722609015, + "grad_norm": 0.5755530595779419, + "learning_rate": 1.996823714918085e-05, + "loss": 1.0717, + "step": 79 + }, + { + "epoch": 0.029314767314034446, + "grad_norm": 0.6203620433807373, + "learning_rate": 1.996731032093536e-05, + "loss": 1.0951, + "step": 80 + }, + { + "epoch": 0.029681201905459875, + "grad_norm": 0.6365242004394531, + "learning_rate": 1.99663701863301e-05, + "loss": 1.1612, + "step": 81 + }, + { + "epoch": 0.030047636496885306, + "grad_norm": 0.7320545315742493, + "learning_rate": 1.9965416746620147e-05, + "loss": 1.0666, + "step": 82 + }, + { + "epoch": 0.030414071088310738, + "grad_norm": 0.6602171659469604, + "learning_rate": 1.9964450003078336e-05, + "loss": 1.0418, + "step": 83 + }, + { + "epoch": 0.030780505679736166, + "grad_norm": 0.6662451028823853, + "learning_rate": 1.9963469956995278e-05, + "loss": 1.0383, + "step": 84 + }, + { + "epoch": 0.031146940271161598, + "grad_norm": 0.7394860982894897, + "learning_rate": 1.9962476609679328e-05, + "loss": 1.0124, + "step": 85 + }, + { + "epoch": 0.031513374862587026, + "grad_norm": 0.6913645267486572, + "learning_rate": 1.9961469962456613e-05, + "loss": 1.1247, + "step": 86 + }, + { + "epoch": 0.03187980945401246, + "grad_norm": 0.8122132420539856, + "learning_rate": 1.9960450016670997e-05, + "loss": 0.9204, + "step": 87 + }, + { + "epoch": 0.03224624404543789, + "grad_norm": 0.6972271800041199, + "learning_rate": 1.9959416773684113e-05, + "loss": 1.0335, + "step": 88 + }, + { + "epoch": 0.03261267863686332, + "grad_norm": 0.7278566956520081, + "learning_rate": 1.9958370234875345e-05, + "loss": 1.022, + "step": 89 + }, + { + "epoch": 0.03297911322828875, + "grad_norm": 0.6476576328277588, + "learning_rate": 1.9957310401641817e-05, + "loss": 1.0114, + "step": 90 + }, + { + "epoch": 0.03334554781971418, + "grad_norm": 0.7813729643821716, + "learning_rate": 1.9956237275398402e-05, + "loss": 0.9962, + "step": 91 + }, + { + "epoch": 0.03371198241113961, + "grad_norm": 0.7484748959541321, + "learning_rate": 1.9955150857577733e-05, + "loss": 1.1148, + "step": 92 + }, + { + "epoch": 0.034078417002565045, + "grad_norm": 0.6738268136978149, + "learning_rate": 1.995405114963017e-05, + "loss": 1.1177, + "step": 93 + }, + { + "epoch": 0.03444485159399047, + "grad_norm": 0.8074272274971008, + "learning_rate": 1.9952938153023828e-05, + "loss": 1.0971, + "step": 94 + }, + { + "epoch": 0.0348112861854159, + "grad_norm": 0.6509901881217957, + "learning_rate": 1.995181186924456e-05, + "loss": 1.1152, + "step": 95 + }, + { + "epoch": 0.03517772077684134, + "grad_norm": 0.7371155619621277, + "learning_rate": 1.9950672299795947e-05, + "loss": 1.0771, + "step": 96 + }, + { + "epoch": 0.035544155368266765, + "grad_norm": 0.8736119270324707, + "learning_rate": 1.994951944619932e-05, + "loss": 1.0211, + "step": 97 + }, + { + "epoch": 0.035910589959692193, + "grad_norm": 0.8305104970932007, + "learning_rate": 1.9948353309993735e-05, + "loss": 0.9812, + "step": 98 + }, + { + "epoch": 0.03627702455111763, + "grad_norm": 0.7285597920417786, + "learning_rate": 1.994717389273599e-05, + "loss": 1.0571, + "step": 99 + }, + { + "epoch": 0.03664345914254306, + "grad_norm": 0.6941920518875122, + "learning_rate": 1.9945981196000598e-05, + "loss": 1.0796, + "step": 100 + }, + { + "epoch": 0.037009893733968485, + "grad_norm": 0.7027592062950134, + "learning_rate": 1.9944775221379817e-05, + "loss": 1.0951, + "step": 101 + }, + { + "epoch": 0.03737632832539392, + "grad_norm": 0.7003068327903748, + "learning_rate": 1.9943555970483615e-05, + "loss": 1.0421, + "step": 102 + }, + { + "epoch": 0.03774276291681935, + "grad_norm": 0.7429030537605286, + "learning_rate": 1.9942323444939706e-05, + "loss": 1.1032, + "step": 103 + }, + { + "epoch": 0.03810919750824478, + "grad_norm": 0.7443472743034363, + "learning_rate": 1.99410776463935e-05, + "loss": 1.0299, + "step": 104 + }, + { + "epoch": 0.03847563209967021, + "grad_norm": 0.7034825086593628, + "learning_rate": 1.9939818576508138e-05, + "loss": 1.0014, + "step": 105 + }, + { + "epoch": 0.03884206669109564, + "grad_norm": 0.6462461948394775, + "learning_rate": 1.9938546236964482e-05, + "loss": 1.034, + "step": 106 + }, + { + "epoch": 0.03920850128252107, + "grad_norm": 0.981590747833252, + "learning_rate": 1.9937260629461107e-05, + "loss": 1.0843, + "step": 107 + }, + { + "epoch": 0.0395749358739465, + "grad_norm": 0.746054470539093, + "learning_rate": 1.9935961755714298e-05, + "loss": 1.0168, + "step": 108 + }, + { + "epoch": 0.03994137046537193, + "grad_norm": 0.6923157572746277, + "learning_rate": 1.9934649617458044e-05, + "loss": 1.0926, + "step": 109 + }, + { + "epoch": 0.04030780505679736, + "grad_norm": 0.8634763360023499, + "learning_rate": 1.9933324216444064e-05, + "loss": 1.093, + "step": 110 + }, + { + "epoch": 0.04067423964822279, + "grad_norm": 0.773468554019928, + "learning_rate": 1.9931985554441754e-05, + "loss": 1.0596, + "step": 111 + }, + { + "epoch": 0.041040674239648224, + "grad_norm": 0.6959455609321594, + "learning_rate": 1.993063363323824e-05, + "loss": 1.085, + "step": 112 + }, + { + "epoch": 0.04140710883107365, + "grad_norm": 0.7528244256973267, + "learning_rate": 1.9929268454638326e-05, + "loss": 0.9813, + "step": 113 + }, + { + "epoch": 0.04177354342249908, + "grad_norm": 0.6986820697784424, + "learning_rate": 1.9927890020464535e-05, + "loss": 1.0718, + "step": 114 + }, + { + "epoch": 0.042139978013924516, + "grad_norm": 0.790373682975769, + "learning_rate": 1.992649833255707e-05, + "loss": 0.9903, + "step": 115 + }, + { + "epoch": 0.042506412605349944, + "grad_norm": 0.7352178692817688, + "learning_rate": 1.9925093392773837e-05, + "loss": 1.0697, + "step": 116 + }, + { + "epoch": 0.04287284719677537, + "grad_norm": 0.8071436882019043, + "learning_rate": 1.9923675202990428e-05, + "loss": 0.927, + "step": 117 + }, + { + "epoch": 0.04323928178820081, + "grad_norm": 0.912890613079071, + "learning_rate": 1.9922243765100133e-05, + "loss": 0.9728, + "step": 118 + }, + { + "epoch": 0.043605716379626236, + "grad_norm": 0.9719120860099792, + "learning_rate": 1.992079908101392e-05, + "loss": 0.989, + "step": 119 + }, + { + "epoch": 0.043972150971051664, + "grad_norm": 0.7970651984214783, + "learning_rate": 1.9919341152660433e-05, + "loss": 1.0516, + "step": 120 + }, + { + "epoch": 0.0443385855624771, + "grad_norm": 0.7409046292304993, + "learning_rate": 1.9917869981986014e-05, + "loss": 1.0232, + "step": 121 + }, + { + "epoch": 0.04470502015390253, + "grad_norm": 0.7631481885910034, + "learning_rate": 1.991638557095468e-05, + "loss": 1.0237, + "step": 122 + }, + { + "epoch": 0.045071454745327956, + "grad_norm": 0.7730304002761841, + "learning_rate": 1.9914887921548112e-05, + "loss": 0.9949, + "step": 123 + }, + { + "epoch": 0.04543788933675339, + "grad_norm": 0.8115593791007996, + "learning_rate": 1.9913377035765677e-05, + "loss": 1.0673, + "step": 124 + }, + { + "epoch": 0.04580432392817882, + "grad_norm": 0.8105559349060059, + "learning_rate": 1.9911852915624405e-05, + "loss": 1.0031, + "step": 125 + }, + { + "epoch": 0.04617075851960425, + "grad_norm": 0.807749330997467, + "learning_rate": 1.9910315563159e-05, + "loss": 1.0177, + "step": 126 + }, + { + "epoch": 0.04653719311102968, + "grad_norm": 0.8281621336936951, + "learning_rate": 1.990876498042182e-05, + "loss": 1.0317, + "step": 127 + }, + { + "epoch": 0.04690362770245511, + "grad_norm": 0.7854720950126648, + "learning_rate": 1.9907201169482894e-05, + "loss": 1.0084, + "step": 128 + }, + { + "epoch": 0.04727006229388054, + "grad_norm": 0.7980220913887024, + "learning_rate": 1.9905624132429922e-05, + "loss": 1.005, + "step": 129 + }, + { + "epoch": 0.047636496885305975, + "grad_norm": 0.7824934720993042, + "learning_rate": 1.9904033871368232e-05, + "loss": 1.0137, + "step": 130 + }, + { + "epoch": 0.0480029314767314, + "grad_norm": 0.8496504426002502, + "learning_rate": 1.990243038842084e-05, + "loss": 1.069, + "step": 131 + }, + { + "epoch": 0.04836936606815683, + "grad_norm": 0.8010658025741577, + "learning_rate": 1.990081368572838e-05, + "loss": 0.9737, + "step": 132 + }, + { + "epoch": 0.04873580065958227, + "grad_norm": 0.8724714517593384, + "learning_rate": 1.9899183765449157e-05, + "loss": 0.9907, + "step": 133 + }, + { + "epoch": 0.049102235251007695, + "grad_norm": 0.8537774085998535, + "learning_rate": 1.9897540629759116e-05, + "loss": 1.0092, + "step": 134 + }, + { + "epoch": 0.04946866984243312, + "grad_norm": 0.7882579565048218, + "learning_rate": 1.9895884280851843e-05, + "loss": 1.0597, + "step": 135 + }, + { + "epoch": 0.04983510443385856, + "grad_norm": 0.9498627781867981, + "learning_rate": 1.989421472093856e-05, + "loss": 0.8423, + "step": 136 + }, + { + "epoch": 0.05020153902528399, + "grad_norm": 0.774014413356781, + "learning_rate": 1.9892531952248132e-05, + "loss": 1.0462, + "step": 137 + }, + { + "epoch": 0.050567973616709415, + "grad_norm": 0.7702706456184387, + "learning_rate": 1.989083597702705e-05, + "loss": 1.0541, + "step": 138 + }, + { + "epoch": 0.05093440820813485, + "grad_norm": 0.8901228308677673, + "learning_rate": 1.9889126797539454e-05, + "loss": 0.9953, + "step": 139 + }, + { + "epoch": 0.05130084279956028, + "grad_norm": 0.9667432308197021, + "learning_rate": 1.988740441606709e-05, + "loss": 0.8933, + "step": 140 + }, + { + "epoch": 0.05166727739098571, + "grad_norm": 0.8098576068878174, + "learning_rate": 1.9885668834909335e-05, + "loss": 0.9936, + "step": 141 + }, + { + "epoch": 0.05203371198241114, + "grad_norm": 0.8478096723556519, + "learning_rate": 1.9883920056383195e-05, + "loss": 1.0868, + "step": 142 + }, + { + "epoch": 0.05240014657383657, + "grad_norm": 1.008238434791565, + "learning_rate": 1.9882158082823287e-05, + "loss": 0.9707, + "step": 143 + }, + { + "epoch": 0.052766581165262, + "grad_norm": 0.9024046063423157, + "learning_rate": 1.9880382916581848e-05, + "loss": 0.9777, + "step": 144 + }, + { + "epoch": 0.053133015756687434, + "grad_norm": 0.8705045580863953, + "learning_rate": 1.9878594560028725e-05, + "loss": 1.0243, + "step": 145 + }, + { + "epoch": 0.05349945034811286, + "grad_norm": 0.7713609337806702, + "learning_rate": 1.987679301555137e-05, + "loss": 1.011, + "step": 146 + }, + { + "epoch": 0.05386588493953829, + "grad_norm": 0.9589905142784119, + "learning_rate": 1.9874978285554846e-05, + "loss": 0.9648, + "step": 147 + }, + { + "epoch": 0.054232319530963725, + "grad_norm": 0.8619270920753479, + "learning_rate": 1.987315037246182e-05, + "loss": 0.9608, + "step": 148 + }, + { + "epoch": 0.054598754122389154, + "grad_norm": 0.789116382598877, + "learning_rate": 1.9871309278712558e-05, + "loss": 1.0212, + "step": 149 + }, + { + "epoch": 0.05496518871381458, + "grad_norm": 0.8430724740028381, + "learning_rate": 1.986945500676491e-05, + "loss": 1.0186, + "step": 150 + }, + { + "epoch": 0.05533162330524002, + "grad_norm": 0.9624394774436951, + "learning_rate": 1.9867587559094343e-05, + "loss": 0.9897, + "step": 151 + }, + { + "epoch": 0.055698057896665445, + "grad_norm": 0.9204763770103455, + "learning_rate": 1.986570693819389e-05, + "loss": 0.9457, + "step": 152 + }, + { + "epoch": 0.056064492488090874, + "grad_norm": 1.0764364004135132, + "learning_rate": 1.9863813146574186e-05, + "loss": 0.9249, + "step": 153 + }, + { + "epoch": 0.05643092707951631, + "grad_norm": 0.8509868383407593, + "learning_rate": 1.9861906186763443e-05, + "loss": 0.9667, + "step": 154 + }, + { + "epoch": 0.05679736167094174, + "grad_norm": 0.7725467681884766, + "learning_rate": 1.9859986061307443e-05, + "loss": 0.9902, + "step": 155 + }, + { + "epoch": 0.057163796262367166, + "grad_norm": 0.933456301689148, + "learning_rate": 1.985805277276956e-05, + "loss": 0.9678, + "step": 156 + }, + { + "epoch": 0.0575302308537926, + "grad_norm": 0.8996209502220154, + "learning_rate": 1.9856106323730742e-05, + "loss": 0.9796, + "step": 157 + }, + { + "epoch": 0.05789666544521803, + "grad_norm": 0.8602309823036194, + "learning_rate": 1.9854146716789488e-05, + "loss": 1.0265, + "step": 158 + }, + { + "epoch": 0.05826310003664346, + "grad_norm": 0.9406113624572754, + "learning_rate": 1.9852173954561875e-05, + "loss": 0.9452, + "step": 159 + }, + { + "epoch": 0.05862953462806889, + "grad_norm": 0.7956333160400391, + "learning_rate": 1.985018803968154e-05, + "loss": 1.061, + "step": 160 + }, + { + "epoch": 0.05899596921949432, + "grad_norm": 0.7723748087882996, + "learning_rate": 1.9848188974799686e-05, + "loss": 1.0782, + "step": 161 + }, + { + "epoch": 0.05936240381091975, + "grad_norm": 1.1234358549118042, + "learning_rate": 1.9846176762585058e-05, + "loss": 0.8828, + "step": 162 + }, + { + "epoch": 0.059728838402345184, + "grad_norm": 0.8539266586303711, + "learning_rate": 1.9844151405723962e-05, + "loss": 0.9929, + "step": 163 + }, + { + "epoch": 0.06009527299377061, + "grad_norm": 0.8535122871398926, + "learning_rate": 1.9842112906920247e-05, + "loss": 0.9884, + "step": 164 + }, + { + "epoch": 0.06046170758519604, + "grad_norm": 0.7588261365890503, + "learning_rate": 1.9840061268895315e-05, + "loss": 1.0081, + "step": 165 + }, + { + "epoch": 0.060828142176621476, + "grad_norm": 0.7827401757240295, + "learning_rate": 1.9837996494388098e-05, + "loss": 0.9491, + "step": 166 + }, + { + "epoch": 0.061194576768046904, + "grad_norm": 0.7755241990089417, + "learning_rate": 1.983591858615507e-05, + "loss": 1.056, + "step": 167 + }, + { + "epoch": 0.06156101135947233, + "grad_norm": 0.8388570547103882, + "learning_rate": 1.983382754697024e-05, + "loss": 0.9922, + "step": 168 + }, + { + "epoch": 0.06192744595089777, + "grad_norm": 0.9278474450111389, + "learning_rate": 1.983172337962515e-05, + "loss": 0.912, + "step": 169 + }, + { + "epoch": 0.062293880542323196, + "grad_norm": 0.847387969493866, + "learning_rate": 1.982960608692885e-05, + "loss": 1.0021, + "step": 170 + }, + { + "epoch": 0.06266031513374863, + "grad_norm": 0.8056350350379944, + "learning_rate": 1.982747567170794e-05, + "loss": 1.0151, + "step": 171 + }, + { + "epoch": 0.06302674972517405, + "grad_norm": 0.6489182114601135, + "learning_rate": 1.9825332136806516e-05, + "loss": 0.6902, + "step": 172 + }, + { + "epoch": 0.06339318431659949, + "grad_norm": 0.8523520827293396, + "learning_rate": 1.9823175485086203e-05, + "loss": 1.0431, + "step": 173 + }, + { + "epoch": 0.06375961890802492, + "grad_norm": 0.8439933061599731, + "learning_rate": 1.982100571942613e-05, + "loss": 1.0139, + "step": 174 + }, + { + "epoch": 0.06412605349945034, + "grad_norm": 0.7784318327903748, + "learning_rate": 1.9818822842722926e-05, + "loss": 0.9976, + "step": 175 + }, + { + "epoch": 0.06449248809087578, + "grad_norm": 0.9022171497344971, + "learning_rate": 1.981662685789074e-05, + "loss": 0.9754, + "step": 176 + }, + { + "epoch": 0.06485892268230121, + "grad_norm": 0.8426647186279297, + "learning_rate": 1.9814417767861216e-05, + "loss": 0.9734, + "step": 177 + }, + { + "epoch": 0.06522535727372664, + "grad_norm": 0.8823671340942383, + "learning_rate": 1.9812195575583478e-05, + "loss": 1.0413, + "step": 178 + }, + { + "epoch": 0.06559179186515207, + "grad_norm": 0.8977214694023132, + "learning_rate": 1.980996028402416e-05, + "loss": 0.959, + "step": 179 + }, + { + "epoch": 0.0659582264565775, + "grad_norm": 0.8814698457717896, + "learning_rate": 1.980771189616737e-05, + "loss": 0.9576, + "step": 180 + }, + { + "epoch": 0.06632466104800293, + "grad_norm": 1.0457162857055664, + "learning_rate": 1.9805450415014714e-05, + "loss": 0.9335, + "step": 181 + }, + { + "epoch": 0.06669109563942836, + "grad_norm": 0.9762332439422607, + "learning_rate": 1.9803175843585268e-05, + "loss": 0.944, + "step": 182 + }, + { + "epoch": 0.0670575302308538, + "grad_norm": 0.8236628770828247, + "learning_rate": 1.9800888184915582e-05, + "loss": 1.0402, + "step": 183 + }, + { + "epoch": 0.06742396482227922, + "grad_norm": 1.0684088468551636, + "learning_rate": 1.9798587442059676e-05, + "loss": 1.011, + "step": 184 + }, + { + "epoch": 0.06779039941370466, + "grad_norm": 0.9889763593673706, + "learning_rate": 1.979627361808905e-05, + "loss": 0.9743, + "step": 185 + }, + { + "epoch": 0.06815683400513009, + "grad_norm": 0.8603727221488953, + "learning_rate": 1.9793946716092658e-05, + "loss": 1.0072, + "step": 186 + }, + { + "epoch": 0.06852326859655551, + "grad_norm": 0.998291552066803, + "learning_rate": 1.979160673917691e-05, + "loss": 0.8558, + "step": 187 + }, + { + "epoch": 0.06888970318798095, + "grad_norm": 0.8995135426521301, + "learning_rate": 1.978925369046568e-05, + "loss": 1.0158, + "step": 188 + }, + { + "epoch": 0.06925613777940638, + "grad_norm": 1.0453314781188965, + "learning_rate": 1.9786887573100283e-05, + "loss": 0.9511, + "step": 189 + }, + { + "epoch": 0.0696225723708318, + "grad_norm": 0.883664608001709, + "learning_rate": 1.9784508390239492e-05, + "loss": 0.9613, + "step": 190 + }, + { + "epoch": 0.06998900696225724, + "grad_norm": 0.8757372498512268, + "learning_rate": 1.978211614505951e-05, + "loss": 1.0289, + "step": 191 + }, + { + "epoch": 0.07035544155368267, + "grad_norm": 0.9564153552055359, + "learning_rate": 1.977971084075399e-05, + "loss": 0.9337, + "step": 192 + }, + { + "epoch": 0.0707218761451081, + "grad_norm": 0.9205679893493652, + "learning_rate": 1.9777292480534008e-05, + "loss": 0.9985, + "step": 193 + }, + { + "epoch": 0.07108831073653353, + "grad_norm": 0.8132325410842896, + "learning_rate": 1.977486106762808e-05, + "loss": 0.9598, + "step": 194 + }, + { + "epoch": 0.07145474532795897, + "grad_norm": 0.9424161911010742, + "learning_rate": 1.9772416605282135e-05, + "loss": 0.9823, + "step": 195 + }, + { + "epoch": 0.07182117991938439, + "grad_norm": 0.9367775917053223, + "learning_rate": 1.9769959096759538e-05, + "loss": 1.0072, + "step": 196 + }, + { + "epoch": 0.07218761451080982, + "grad_norm": 0.8410287499427795, + "learning_rate": 1.9767488545341062e-05, + "loss": 1.0155, + "step": 197 + }, + { + "epoch": 0.07255404910223526, + "grad_norm": 1.0502501726150513, + "learning_rate": 1.9765004954324892e-05, + "loss": 0.9272, + "step": 198 + }, + { + "epoch": 0.07292048369366068, + "grad_norm": 0.9136397838592529, + "learning_rate": 1.976250832702662e-05, + "loss": 0.9435, + "step": 199 + }, + { + "epoch": 0.07328691828508611, + "grad_norm": 1.0021872520446777, + "learning_rate": 1.9759998666779245e-05, + "loss": 0.9796, + "step": 200 + }, + { + "epoch": 0.07365335287651155, + "grad_norm": 0.9359725713729858, + "learning_rate": 1.9757475976933165e-05, + "loss": 0.987, + "step": 201 + }, + { + "epoch": 0.07401978746793697, + "grad_norm": 1.0036094188690186, + "learning_rate": 1.9754940260856172e-05, + "loss": 0.8347, + "step": 202 + }, + { + "epoch": 0.0743862220593624, + "grad_norm": 0.9520359039306641, + "learning_rate": 1.975239152193344e-05, + "loss": 0.9691, + "step": 203 + }, + { + "epoch": 0.07475265665078784, + "grad_norm": 0.9743679761886597, + "learning_rate": 1.9749829763567545e-05, + "loss": 0.9516, + "step": 204 + }, + { + "epoch": 0.07511909124221326, + "grad_norm": 0.9027923941612244, + "learning_rate": 1.974725498917843e-05, + "loss": 0.9631, + "step": 205 + }, + { + "epoch": 0.0754855258336387, + "grad_norm": 0.9035953283309937, + "learning_rate": 1.9744667202203418e-05, + "loss": 1.0298, + "step": 206 + }, + { + "epoch": 0.07585196042506413, + "grad_norm": 0.9111297726631165, + "learning_rate": 1.9742066406097206e-05, + "loss": 0.9723, + "step": 207 + }, + { + "epoch": 0.07621839501648955, + "grad_norm": 0.8764698505401611, + "learning_rate": 1.9739452604331853e-05, + "loss": 1.0585, + "step": 208 + }, + { + "epoch": 0.07658482960791499, + "grad_norm": 1.0158146619796753, + "learning_rate": 1.9736825800396796e-05, + "loss": 1.0051, + "step": 209 + }, + { + "epoch": 0.07695126419934042, + "grad_norm": 0.9445521831512451, + "learning_rate": 1.9734185997798805e-05, + "loss": 1.0046, + "step": 210 + }, + { + "epoch": 0.07731769879076585, + "grad_norm": 1.1613709926605225, + "learning_rate": 1.9731533200062026e-05, + "loss": 0.8739, + "step": 211 + }, + { + "epoch": 0.07768413338219128, + "grad_norm": 0.8824067115783691, + "learning_rate": 1.9728867410727936e-05, + "loss": 1.0177, + "step": 212 + }, + { + "epoch": 0.07805056797361672, + "grad_norm": 0.8832942843437195, + "learning_rate": 1.9726188633355373e-05, + "loss": 1.1069, + "step": 213 + }, + { + "epoch": 0.07841700256504214, + "grad_norm": 0.895889163017273, + "learning_rate": 1.9723496871520494e-05, + "loss": 0.9416, + "step": 214 + }, + { + "epoch": 0.07878343715646757, + "grad_norm": 0.9018993377685547, + "learning_rate": 1.9720792128816812e-05, + "loss": 0.9647, + "step": 215 + }, + { + "epoch": 0.079149871747893, + "grad_norm": 0.9014832973480225, + "learning_rate": 1.9718074408855156e-05, + "loss": 1.0068, + "step": 216 + }, + { + "epoch": 0.07951630633931843, + "grad_norm": 0.9585338234901428, + "learning_rate": 1.9715343715263676e-05, + "loss": 1.0059, + "step": 217 + }, + { + "epoch": 0.07988274093074386, + "grad_norm": 0.8547597527503967, + "learning_rate": 1.9712600051687853e-05, + "loss": 1.0381, + "step": 218 + }, + { + "epoch": 0.08024917552216929, + "grad_norm": 0.9184234142303467, + "learning_rate": 1.9709843421790478e-05, + "loss": 0.9581, + "step": 219 + }, + { + "epoch": 0.08061561011359472, + "grad_norm": 0.9227030873298645, + "learning_rate": 1.970707382925165e-05, + "loss": 0.9675, + "step": 220 + }, + { + "epoch": 0.08098204470502016, + "grad_norm": 0.8544616103172302, + "learning_rate": 1.9704291277768777e-05, + "loss": 0.9977, + "step": 221 + }, + { + "epoch": 0.08134847929644558, + "grad_norm": 0.9003406167030334, + "learning_rate": 1.970149577105657e-05, + "loss": 0.9491, + "step": 222 + }, + { + "epoch": 0.08171491388787101, + "grad_norm": 0.9470295310020447, + "learning_rate": 1.969868731284702e-05, + "loss": 0.9453, + "step": 223 + }, + { + "epoch": 0.08208134847929645, + "grad_norm": 0.9368393421173096, + "learning_rate": 1.9695865906889428e-05, + "loss": 0.9945, + "step": 224 + }, + { + "epoch": 0.08244778307072187, + "grad_norm": 0.8223913908004761, + "learning_rate": 1.9693031556950367e-05, + "loss": 0.9732, + "step": 225 + }, + { + "epoch": 0.0828142176621473, + "grad_norm": 0.8167693018913269, + "learning_rate": 1.969018426681369e-05, + "loss": 0.9945, + "step": 226 + }, + { + "epoch": 0.08318065225357274, + "grad_norm": 0.9292685389518738, + "learning_rate": 1.968732404028054e-05, + "loss": 0.9716, + "step": 227 + }, + { + "epoch": 0.08354708684499816, + "grad_norm": 0.864932119846344, + "learning_rate": 1.968445088116931e-05, + "loss": 0.9755, + "step": 228 + }, + { + "epoch": 0.0839135214364236, + "grad_norm": 1.0593254566192627, + "learning_rate": 1.9681564793315676e-05, + "loss": 0.9336, + "step": 229 + }, + { + "epoch": 0.08427995602784903, + "grad_norm": 0.8241183161735535, + "learning_rate": 1.967866578057256e-05, + "loss": 0.9613, + "step": 230 + }, + { + "epoch": 0.08464639061927445, + "grad_norm": 0.9200891852378845, + "learning_rate": 1.967575384681014e-05, + "loss": 0.9725, + "step": 231 + }, + { + "epoch": 0.08501282521069989, + "grad_norm": 0.9276204109191895, + "learning_rate": 1.967282899591586e-05, + "loss": 0.9295, + "step": 232 + }, + { + "epoch": 0.08537925980212532, + "grad_norm": 1.0053272247314453, + "learning_rate": 1.9669891231794383e-05, + "loss": 0.9801, + "step": 233 + }, + { + "epoch": 0.08574569439355074, + "grad_norm": 0.9977024793624878, + "learning_rate": 1.9666940558367635e-05, + "loss": 0.9326, + "step": 234 + }, + { + "epoch": 0.08611212898497618, + "grad_norm": 0.9312626123428345, + "learning_rate": 1.9663976979574753e-05, + "loss": 0.895, + "step": 235 + }, + { + "epoch": 0.08647856357640162, + "grad_norm": 0.917569100856781, + "learning_rate": 1.9661000499372127e-05, + "loss": 0.9973, + "step": 236 + }, + { + "epoch": 0.08684499816782704, + "grad_norm": 1.0833723545074463, + "learning_rate": 1.9658011121733353e-05, + "loss": 0.9263, + "step": 237 + }, + { + "epoch": 0.08721143275925247, + "grad_norm": 0.8841248750686646, + "learning_rate": 1.9655008850649252e-05, + "loss": 0.9975, + "step": 238 + }, + { + "epoch": 0.08757786735067791, + "grad_norm": 0.9899241328239441, + "learning_rate": 1.965199369012785e-05, + "loss": 0.9774, + "step": 239 + }, + { + "epoch": 0.08794430194210333, + "grad_norm": 0.9991698265075684, + "learning_rate": 1.9648965644194396e-05, + "loss": 0.9299, + "step": 240 + }, + { + "epoch": 0.08831073653352876, + "grad_norm": 0.8291311264038086, + "learning_rate": 1.9645924716891326e-05, + "loss": 1.0274, + "step": 241 + }, + { + "epoch": 0.0886771711249542, + "grad_norm": 0.9432514309883118, + "learning_rate": 1.9642870912278275e-05, + "loss": 0.98, + "step": 242 + }, + { + "epoch": 0.08904360571637962, + "grad_norm": 0.9141779541969299, + "learning_rate": 1.963980423443208e-05, + "loss": 1.0134, + "step": 243 + }, + { + "epoch": 0.08941004030780506, + "grad_norm": 1.1006799936294556, + "learning_rate": 1.963672468744675e-05, + "loss": 0.9813, + "step": 244 + }, + { + "epoch": 0.08977647489923049, + "grad_norm": 1.022298812866211, + "learning_rate": 1.963363227543348e-05, + "loss": 0.9569, + "step": 245 + }, + { + "epoch": 0.09014290949065591, + "grad_norm": 0.8487532734870911, + "learning_rate": 1.9630527002520646e-05, + "loss": 1.0137, + "step": 246 + }, + { + "epoch": 0.09050934408208135, + "grad_norm": 1.1966487169265747, + "learning_rate": 1.9627408872853788e-05, + "loss": 0.8798, + "step": 247 + }, + { + "epoch": 0.09087577867350678, + "grad_norm": 0.9319120049476624, + "learning_rate": 1.96242778905956e-05, + "loss": 0.9981, + "step": 248 + }, + { + "epoch": 0.0912422132649322, + "grad_norm": 0.9164144992828369, + "learning_rate": 1.962113405992595e-05, + "loss": 0.9609, + "step": 249 + }, + { + "epoch": 0.09160864785635764, + "grad_norm": 1.06501042842865, + "learning_rate": 1.961797738504185e-05, + "loss": 0.9288, + "step": 250 + }, + { + "epoch": 0.09197508244778307, + "grad_norm": 0.9936481714248657, + "learning_rate": 1.961480787015746e-05, + "loss": 0.9782, + "step": 251 + }, + { + "epoch": 0.0923415170392085, + "grad_norm": 0.989457368850708, + "learning_rate": 1.961162551950409e-05, + "loss": 0.969, + "step": 252 + }, + { + "epoch": 0.09270795163063393, + "grad_norm": 1.0204919576644897, + "learning_rate": 1.9608430337330172e-05, + "loss": 1.0042, + "step": 253 + }, + { + "epoch": 0.09307438622205937, + "grad_norm": 0.9626120924949646, + "learning_rate": 1.9605222327901274e-05, + "loss": 0.9647, + "step": 254 + }, + { + "epoch": 0.09344082081348479, + "grad_norm": 0.9840561151504517, + "learning_rate": 1.9602001495500087e-05, + "loss": 0.9892, + "step": 255 + }, + { + "epoch": 0.09380725540491022, + "grad_norm": 0.9287786483764648, + "learning_rate": 1.959876784442643e-05, + "loss": 0.9548, + "step": 256 + }, + { + "epoch": 0.09417368999633566, + "grad_norm": 0.9564248323440552, + "learning_rate": 1.959552137899722e-05, + "loss": 0.9827, + "step": 257 + }, + { + "epoch": 0.09454012458776108, + "grad_norm": 0.9497889280319214, + "learning_rate": 1.959226210354649e-05, + "loss": 0.9986, + "step": 258 + }, + { + "epoch": 0.09490655917918651, + "grad_norm": 1.0784432888031006, + "learning_rate": 1.9588990022425378e-05, + "loss": 0.9232, + "step": 259 + }, + { + "epoch": 0.09527299377061195, + "grad_norm": 1.0418896675109863, + "learning_rate": 1.9585705140002107e-05, + "loss": 0.8728, + "step": 260 + }, + { + "epoch": 0.09563942836203737, + "grad_norm": 0.9301762580871582, + "learning_rate": 1.9582407460661995e-05, + "loss": 0.958, + "step": 261 + }, + { + "epoch": 0.0960058629534628, + "grad_norm": 1.0677168369293213, + "learning_rate": 1.9579096988807448e-05, + "loss": 0.8912, + "step": 262 + }, + { + "epoch": 0.09637229754488824, + "grad_norm": 0.9866990447044373, + "learning_rate": 1.9575773728857946e-05, + "loss": 0.955, + "step": 263 + }, + { + "epoch": 0.09673873213631366, + "grad_norm": 1.1164122819900513, + "learning_rate": 1.9572437685250038e-05, + "loss": 1.0106, + "step": 264 + }, + { + "epoch": 0.0971051667277391, + "grad_norm": 1.0720348358154297, + "learning_rate": 1.956908886243734e-05, + "loss": 0.9836, + "step": 265 + }, + { + "epoch": 0.09747160131916453, + "grad_norm": 1.0078245401382446, + "learning_rate": 1.956572726489054e-05, + "loss": 0.882, + "step": 266 + }, + { + "epoch": 0.09783803591058995, + "grad_norm": 0.9684966802597046, + "learning_rate": 1.9562352897097358e-05, + "loss": 1.0168, + "step": 267 + }, + { + "epoch": 0.09820447050201539, + "grad_norm": 1.0070780515670776, + "learning_rate": 1.9558965763562588e-05, + "loss": 0.9401, + "step": 268 + }, + { + "epoch": 0.09857090509344082, + "grad_norm": 1.065958023071289, + "learning_rate": 1.9555565868808047e-05, + "loss": 0.9685, + "step": 269 + }, + { + "epoch": 0.09893733968486625, + "grad_norm": 1.1749391555786133, + "learning_rate": 1.9552153217372592e-05, + "loss": 0.8862, + "step": 270 + }, + { + "epoch": 0.09930377427629168, + "grad_norm": 1.0479028224945068, + "learning_rate": 1.9548727813812117e-05, + "loss": 0.9078, + "step": 271 + }, + { + "epoch": 0.09967020886771712, + "grad_norm": 1.0470627546310425, + "learning_rate": 1.9545289662699537e-05, + "loss": 0.9752, + "step": 272 + }, + { + "epoch": 0.10003664345914254, + "grad_norm": 0.9723828434944153, + "learning_rate": 1.9541838768624785e-05, + "loss": 0.9602, + "step": 273 + }, + { + "epoch": 0.10040307805056797, + "grad_norm": 0.97696852684021, + "learning_rate": 1.9538375136194794e-05, + "loss": 0.9589, + "step": 274 + }, + { + "epoch": 0.10076951264199341, + "grad_norm": 0.9613191485404968, + "learning_rate": 1.9534898770033525e-05, + "loss": 0.9577, + "step": 275 + }, + { + "epoch": 0.10113594723341883, + "grad_norm": 0.932762086391449, + "learning_rate": 1.953140967478193e-05, + "loss": 0.9514, + "step": 276 + }, + { + "epoch": 0.10150238182484426, + "grad_norm": 0.9490351676940918, + "learning_rate": 1.9527907855097933e-05, + "loss": 1.0122, + "step": 277 + }, + { + "epoch": 0.1018688164162697, + "grad_norm": 0.9454355835914612, + "learning_rate": 1.9524393315656484e-05, + "loss": 0.9801, + "step": 278 + }, + { + "epoch": 0.10223525100769512, + "grad_norm": 0.9905188679695129, + "learning_rate": 1.952086606114948e-05, + "loss": 0.9325, + "step": 279 + }, + { + "epoch": 0.10260168559912056, + "grad_norm": 1.023087978363037, + "learning_rate": 1.9517326096285808e-05, + "loss": 0.9594, + "step": 280 + }, + { + "epoch": 0.10296812019054599, + "grad_norm": 0.9482645988464355, + "learning_rate": 1.951377342579133e-05, + "loss": 0.9682, + "step": 281 + }, + { + "epoch": 0.10333455478197141, + "grad_norm": 1.154199481010437, + "learning_rate": 1.951020805440885e-05, + "loss": 0.8358, + "step": 282 + }, + { + "epoch": 0.10370098937339685, + "grad_norm": 1.1318973302841187, + "learning_rate": 1.9506629986898144e-05, + "loss": 1.0204, + "step": 283 + }, + { + "epoch": 0.10406742396482228, + "grad_norm": 1.0413382053375244, + "learning_rate": 1.9503039228035936e-05, + "loss": 0.9936, + "step": 284 + }, + { + "epoch": 0.1044338585562477, + "grad_norm": 1.1161582469940186, + "learning_rate": 1.9499435782615882e-05, + "loss": 0.8052, + "step": 285 + }, + { + "epoch": 0.10480029314767314, + "grad_norm": 1.00542151927948, + "learning_rate": 1.9495819655448588e-05, + "loss": 0.9014, + "step": 286 + }, + { + "epoch": 0.10516672773909858, + "grad_norm": 1.0385977029800415, + "learning_rate": 1.9492190851361576e-05, + "loss": 0.995, + "step": 287 + }, + { + "epoch": 0.105533162330524, + "grad_norm": 1.0507909059524536, + "learning_rate": 1.948854937519931e-05, + "loss": 0.974, + "step": 288 + }, + { + "epoch": 0.10589959692194943, + "grad_norm": 0.9018959403038025, + "learning_rate": 1.9484895231823153e-05, + "loss": 0.9413, + "step": 289 + }, + { + "epoch": 0.10626603151337487, + "grad_norm": 0.9558141827583313, + "learning_rate": 1.9481228426111392e-05, + "loss": 0.978, + "step": 290 + }, + { + "epoch": 0.10663246610480029, + "grad_norm": 0.975355863571167, + "learning_rate": 1.9477548962959207e-05, + "loss": 1.0181, + "step": 291 + }, + { + "epoch": 0.10699890069622572, + "grad_norm": 0.9691599607467651, + "learning_rate": 1.9473856847278688e-05, + "loss": 0.9769, + "step": 292 + }, + { + "epoch": 0.10736533528765116, + "grad_norm": 1.0449618101119995, + "learning_rate": 1.94701520839988e-05, + "loss": 0.9867, + "step": 293 + }, + { + "epoch": 0.10773176987907658, + "grad_norm": 0.9220532178878784, + "learning_rate": 1.9466434678065416e-05, + "loss": 0.9805, + "step": 294 + }, + { + "epoch": 0.10809820447050202, + "grad_norm": 1.054179310798645, + "learning_rate": 1.946270463444126e-05, + "loss": 0.9521, + "step": 295 + }, + { + "epoch": 0.10846463906192745, + "grad_norm": 1.110876202583313, + "learning_rate": 1.9458961958105948e-05, + "loss": 0.9413, + "step": 296 + }, + { + "epoch": 0.10883107365335287, + "grad_norm": 0.9981120824813843, + "learning_rate": 1.945520665405595e-05, + "loss": 0.9817, + "step": 297 + }, + { + "epoch": 0.10919750824477831, + "grad_norm": 1.1343566179275513, + "learning_rate": 1.94514387273046e-05, + "loss": 0.9903, + "step": 298 + }, + { + "epoch": 0.10956394283620374, + "grad_norm": 1.007285237312317, + "learning_rate": 1.944765818288208e-05, + "loss": 1.0138, + "step": 299 + }, + { + "epoch": 0.10993037742762916, + "grad_norm": 1.0697396993637085, + "learning_rate": 1.944386502583541e-05, + "loss": 0.9171, + "step": 300 + }, + { + "epoch": 0.1102968120190546, + "grad_norm": 0.9989778995513916, + "learning_rate": 1.9440059261228463e-05, + "loss": 1.0735, + "step": 301 + }, + { + "epoch": 0.11066324661048003, + "grad_norm": 1.0370742082595825, + "learning_rate": 1.9436240894141933e-05, + "loss": 0.9576, + "step": 302 + }, + { + "epoch": 0.11102968120190546, + "grad_norm": 1.0512537956237793, + "learning_rate": 1.9432409929673336e-05, + "loss": 0.9519, + "step": 303 + }, + { + "epoch": 0.11139611579333089, + "grad_norm": 1.1380470991134644, + "learning_rate": 1.942856637293702e-05, + "loss": 0.93, + "step": 304 + }, + { + "epoch": 0.11176255038475633, + "grad_norm": 1.0147130489349365, + "learning_rate": 1.9424710229064117e-05, + "loss": 0.9551, + "step": 305 + }, + { + "epoch": 0.11212898497618175, + "grad_norm": 0.6644235849380493, + "learning_rate": 1.94208415032026e-05, + "loss": 0.6299, + "step": 306 + }, + { + "epoch": 0.11249541956760718, + "grad_norm": 1.1452592611312866, + "learning_rate": 1.94169602005172e-05, + "loss": 0.9289, + "step": 307 + }, + { + "epoch": 0.11286185415903262, + "grad_norm": 0.7639943361282349, + "learning_rate": 1.9413066326189463e-05, + "loss": 0.6014, + "step": 308 + }, + { + "epoch": 0.11322828875045804, + "grad_norm": 0.9849272966384888, + "learning_rate": 1.9409159885417717e-05, + "loss": 0.9686, + "step": 309 + }, + { + "epoch": 0.11359472334188347, + "grad_norm": 0.9223613142967224, + "learning_rate": 1.9405240883417052e-05, + "loss": 0.8893, + "step": 310 + }, + { + "epoch": 0.11396115793330891, + "grad_norm": 1.0126756429672241, + "learning_rate": 1.940130932541934e-05, + "loss": 0.9326, + "step": 311 + }, + { + "epoch": 0.11432759252473433, + "grad_norm": 1.0185225009918213, + "learning_rate": 1.939736521667321e-05, + "loss": 0.9744, + "step": 312 + }, + { + "epoch": 0.11469402711615977, + "grad_norm": 1.1406480073928833, + "learning_rate": 1.9393408562444048e-05, + "loss": 1.003, + "step": 313 + }, + { + "epoch": 0.1150604617075852, + "grad_norm": 1.078837513923645, + "learning_rate": 1.9389439368013987e-05, + "loss": 0.9552, + "step": 314 + }, + { + "epoch": 0.11542689629901062, + "grad_norm": 1.042822003364563, + "learning_rate": 1.9385457638681902e-05, + "loss": 0.9658, + "step": 315 + }, + { + "epoch": 0.11579333089043606, + "grad_norm": 0.9646812677383423, + "learning_rate": 1.9381463379763403e-05, + "loss": 0.9181, + "step": 316 + }, + { + "epoch": 0.1161597654818615, + "grad_norm": 1.1890491247177124, + "learning_rate": 1.9377456596590826e-05, + "loss": 0.885, + "step": 317 + }, + { + "epoch": 0.11652620007328691, + "grad_norm": 1.0086746215820312, + "learning_rate": 1.9373437294513225e-05, + "loss": 1.0218, + "step": 318 + }, + { + "epoch": 0.11689263466471235, + "grad_norm": 0.7628529071807861, + "learning_rate": 1.9369405478896367e-05, + "loss": 0.5956, + "step": 319 + }, + { + "epoch": 0.11725906925613779, + "grad_norm": 0.9907658100128174, + "learning_rate": 1.936536115512273e-05, + "loss": 0.9748, + "step": 320 + }, + { + "epoch": 0.1176255038475632, + "grad_norm": 0.9829033613204956, + "learning_rate": 1.9361304328591485e-05, + "loss": 0.9372, + "step": 321 + }, + { + "epoch": 0.11799193843898864, + "grad_norm": 0.9079287052154541, + "learning_rate": 1.9357235004718493e-05, + "loss": 0.9166, + "step": 322 + }, + { + "epoch": 0.11835837303041408, + "grad_norm": 0.9061397314071655, + "learning_rate": 1.9353153188936304e-05, + "loss": 0.913, + "step": 323 + }, + { + "epoch": 0.1187248076218395, + "grad_norm": 1.019878625869751, + "learning_rate": 1.9349058886694146e-05, + "loss": 0.9803, + "step": 324 + }, + { + "epoch": 0.11909124221326493, + "grad_norm": 1.2982869148254395, + "learning_rate": 1.9344952103457906e-05, + "loss": 0.9391, + "step": 325 + }, + { + "epoch": 0.11945767680469037, + "grad_norm": 1.0518271923065186, + "learning_rate": 1.9340832844710143e-05, + "loss": 0.9108, + "step": 326 + }, + { + "epoch": 0.11982411139611579, + "grad_norm": 1.0201774835586548, + "learning_rate": 1.9336701115950066e-05, + "loss": 0.9951, + "step": 327 + }, + { + "epoch": 0.12019054598754123, + "grad_norm": 1.065725564956665, + "learning_rate": 1.9332556922693537e-05, + "loss": 0.9327, + "step": 328 + }, + { + "epoch": 0.12055698057896666, + "grad_norm": 0.9934291839599609, + "learning_rate": 1.9328400270473046e-05, + "loss": 0.883, + "step": 329 + }, + { + "epoch": 0.12092341517039208, + "grad_norm": 1.004043459892273, + "learning_rate": 1.9324231164837733e-05, + "loss": 0.9671, + "step": 330 + }, + { + "epoch": 0.12128984976181752, + "grad_norm": 1.190549612045288, + "learning_rate": 1.9320049611353357e-05, + "loss": 0.8884, + "step": 331 + }, + { + "epoch": 0.12165628435324295, + "grad_norm": 1.0553498268127441, + "learning_rate": 1.9315855615602278e-05, + "loss": 1.0202, + "step": 332 + }, + { + "epoch": 0.12202271894466837, + "grad_norm": 1.1617465019226074, + "learning_rate": 1.931164918318349e-05, + "loss": 0.9008, + "step": 333 + }, + { + "epoch": 0.12238915353609381, + "grad_norm": 0.9961878657341003, + "learning_rate": 1.930743031971258e-05, + "loss": 1.0226, + "step": 334 + }, + { + "epoch": 0.12275558812751924, + "grad_norm": 1.0462695360183716, + "learning_rate": 1.9303199030821725e-05, + "loss": 0.9614, + "step": 335 + }, + { + "epoch": 0.12312202271894467, + "grad_norm": 1.0623806715011597, + "learning_rate": 1.9298955322159708e-05, + "loss": 0.9325, + "step": 336 + }, + { + "epoch": 0.1234884573103701, + "grad_norm": 1.0319890975952148, + "learning_rate": 1.9294699199391868e-05, + "loss": 1.0047, + "step": 337 + }, + { + "epoch": 0.12385489190179554, + "grad_norm": 1.0860097408294678, + "learning_rate": 1.9290430668200134e-05, + "loss": 0.977, + "step": 338 + }, + { + "epoch": 0.12422132649322096, + "grad_norm": 1.0328339338302612, + "learning_rate": 1.9286149734282998e-05, + "loss": 0.9675, + "step": 339 + }, + { + "epoch": 0.12458776108464639, + "grad_norm": 0.9981843829154968, + "learning_rate": 1.9281856403355503e-05, + "loss": 1.0054, + "step": 340 + }, + { + "epoch": 0.12495419567607183, + "grad_norm": 1.0083777904510498, + "learning_rate": 1.9277550681149246e-05, + "loss": 0.9567, + "step": 341 + }, + { + "epoch": 0.12532063026749726, + "grad_norm": 0.9963028430938721, + "learning_rate": 1.9273232573412372e-05, + "loss": 0.9788, + "step": 342 + }, + { + "epoch": 0.12568706485892267, + "grad_norm": 1.0055075883865356, + "learning_rate": 1.9268902085909547e-05, + "loss": 0.9923, + "step": 343 + }, + { + "epoch": 0.1260534994503481, + "grad_norm": 1.0248360633850098, + "learning_rate": 1.9264559224421975e-05, + "loss": 0.9404, + "step": 344 + }, + { + "epoch": 0.12641993404177354, + "grad_norm": 1.0669928789138794, + "learning_rate": 1.926020399474738e-05, + "loss": 0.9099, + "step": 345 + }, + { + "epoch": 0.12678636863319898, + "grad_norm": 1.0234222412109375, + "learning_rate": 1.925583640269999e-05, + "loss": 0.993, + "step": 346 + }, + { + "epoch": 0.1271528032246244, + "grad_norm": 1.0699959993362427, + "learning_rate": 1.925145645411054e-05, + "loss": 0.8935, + "step": 347 + }, + { + "epoch": 0.12751923781604985, + "grad_norm": 1.0531938076019287, + "learning_rate": 1.9247064154826268e-05, + "loss": 0.996, + "step": 348 + }, + { + "epoch": 0.12788567240747525, + "grad_norm": 1.0274754762649536, + "learning_rate": 1.9242659510710885e-05, + "loss": 0.9521, + "step": 349 + }, + { + "epoch": 0.1282521069989007, + "grad_norm": 0.9951926469802856, + "learning_rate": 1.9238242527644595e-05, + "loss": 0.9261, + "step": 350 + }, + { + "epoch": 0.12861854159032612, + "grad_norm": 1.2358548641204834, + "learning_rate": 1.923381321152407e-05, + "loss": 0.8526, + "step": 351 + }, + { + "epoch": 0.12898497618175156, + "grad_norm": 0.9344589114189148, + "learning_rate": 1.9229371568262447e-05, + "loss": 0.9233, + "step": 352 + }, + { + "epoch": 0.129351410773177, + "grad_norm": 1.0159389972686768, + "learning_rate": 1.9224917603789323e-05, + "loss": 0.9833, + "step": 353 + }, + { + "epoch": 0.12971784536460243, + "grad_norm": 1.0737959146499634, + "learning_rate": 1.922045132405074e-05, + "loss": 0.9128, + "step": 354 + }, + { + "epoch": 0.13008427995602784, + "grad_norm": 1.0360455513000488, + "learning_rate": 1.9215972735009178e-05, + "loss": 0.9552, + "step": 355 + }, + { + "epoch": 0.13045071454745327, + "grad_norm": 1.0206668376922607, + "learning_rate": 1.9211481842643557e-05, + "loss": 0.936, + "step": 356 + }, + { + "epoch": 0.1308171491388787, + "grad_norm": 1.0223139524459839, + "learning_rate": 1.920697865294922e-05, + "loss": 0.9847, + "step": 357 + }, + { + "epoch": 0.13118358373030414, + "grad_norm": 1.066007375717163, + "learning_rate": 1.9202463171937917e-05, + "loss": 0.9398, + "step": 358 + }, + { + "epoch": 0.13155001832172958, + "grad_norm": 1.1262496709823608, + "learning_rate": 1.9197935405637824e-05, + "loss": 0.951, + "step": 359 + }, + { + "epoch": 0.131916452913155, + "grad_norm": 0.9768517017364502, + "learning_rate": 1.9193395360093507e-05, + "loss": 0.915, + "step": 360 + }, + { + "epoch": 0.13228288750458042, + "grad_norm": 0.9572597146034241, + "learning_rate": 1.9188843041365927e-05, + "loss": 0.9051, + "step": 361 + }, + { + "epoch": 0.13264932209600586, + "grad_norm": 0.963023841381073, + "learning_rate": 1.9184278455532427e-05, + "loss": 0.9233, + "step": 362 + }, + { + "epoch": 0.1330157566874313, + "grad_norm": 1.2647531032562256, + "learning_rate": 1.9179701608686734e-05, + "loss": 0.895, + "step": 363 + }, + { + "epoch": 0.13338219127885673, + "grad_norm": 1.116768717765808, + "learning_rate": 1.9175112506938934e-05, + "loss": 0.8941, + "step": 364 + }, + { + "epoch": 0.13374862587028216, + "grad_norm": 1.072059154510498, + "learning_rate": 1.9170511156415485e-05, + "loss": 0.9278, + "step": 365 + }, + { + "epoch": 0.1341150604617076, + "grad_norm": 1.1003344058990479, + "learning_rate": 1.916589756325918e-05, + "loss": 0.8847, + "step": 366 + }, + { + "epoch": 0.134481495053133, + "grad_norm": 1.0306068658828735, + "learning_rate": 1.9161271733629174e-05, + "loss": 0.938, + "step": 367 + }, + { + "epoch": 0.13484792964455844, + "grad_norm": 1.0388495922088623, + "learning_rate": 1.915663367370095e-05, + "loss": 1.0279, + "step": 368 + }, + { + "epoch": 0.13521436423598387, + "grad_norm": 1.0442951917648315, + "learning_rate": 1.9151983389666312e-05, + "loss": 0.8832, + "step": 369 + }, + { + "epoch": 0.1355807988274093, + "grad_norm": 1.3007270097732544, + "learning_rate": 1.9147320887733392e-05, + "loss": 0.8581, + "step": 370 + }, + { + "epoch": 0.13594723341883475, + "grad_norm": 1.0682867765426636, + "learning_rate": 1.9142646174126634e-05, + "loss": 0.9914, + "step": 371 + }, + { + "epoch": 0.13631366801026018, + "grad_norm": 1.3072115182876587, + "learning_rate": 1.913795925508678e-05, + "loss": 1.019, + "step": 372 + }, + { + "epoch": 0.1366801026016856, + "grad_norm": 1.0528311729431152, + "learning_rate": 1.9133260136870864e-05, + "loss": 0.9199, + "step": 373 + }, + { + "epoch": 0.13704653719311102, + "grad_norm": 0.9990109205245972, + "learning_rate": 1.912854882575221e-05, + "loss": 0.9312, + "step": 374 + }, + { + "epoch": 0.13741297178453646, + "grad_norm": 1.0572776794433594, + "learning_rate": 1.9123825328020417e-05, + "loss": 0.8832, + "step": 375 + }, + { + "epoch": 0.1377794063759619, + "grad_norm": 1.1701194047927856, + "learning_rate": 1.9119089649981362e-05, + "loss": 0.8376, + "step": 376 + }, + { + "epoch": 0.13814584096738733, + "grad_norm": 1.025697946548462, + "learning_rate": 1.9114341797957166e-05, + "loss": 1.0191, + "step": 377 + }, + { + "epoch": 0.13851227555881276, + "grad_norm": 1.1118934154510498, + "learning_rate": 1.9109581778286217e-05, + "loss": 0.9342, + "step": 378 + }, + { + "epoch": 0.13887871015023817, + "grad_norm": 1.0030187368392944, + "learning_rate": 1.910480959732314e-05, + "loss": 0.9565, + "step": 379 + }, + { + "epoch": 0.1392451447416636, + "grad_norm": 0.994335949420929, + "learning_rate": 1.9100025261438798e-05, + "loss": 0.9429, + "step": 380 + }, + { + "epoch": 0.13961157933308904, + "grad_norm": 1.1278151273727417, + "learning_rate": 1.909522877702028e-05, + "loss": 0.9287, + "step": 381 + }, + { + "epoch": 0.13997801392451448, + "grad_norm": 1.0508943796157837, + "learning_rate": 1.909042015047089e-05, + "loss": 0.9015, + "step": 382 + }, + { + "epoch": 0.1403444485159399, + "grad_norm": 1.0135952234268188, + "learning_rate": 1.9085599388210148e-05, + "loss": 1.0046, + "step": 383 + }, + { + "epoch": 0.14071088310736535, + "grad_norm": 1.016960859298706, + "learning_rate": 1.908076649667377e-05, + "loss": 1.0349, + "step": 384 + }, + { + "epoch": 0.14107731769879076, + "grad_norm": 1.0783015489578247, + "learning_rate": 1.9075921482313668e-05, + "loss": 0.9434, + "step": 385 + }, + { + "epoch": 0.1414437522902162, + "grad_norm": 1.0530004501342773, + "learning_rate": 1.907106435159793e-05, + "loss": 0.8487, + "step": 386 + }, + { + "epoch": 0.14181018688164163, + "grad_norm": 1.109596848487854, + "learning_rate": 1.9066195111010834e-05, + "loss": 0.9367, + "step": 387 + }, + { + "epoch": 0.14217662147306706, + "grad_norm": 1.0377064943313599, + "learning_rate": 1.9061313767052815e-05, + "loss": 1.0746, + "step": 388 + }, + { + "epoch": 0.1425430560644925, + "grad_norm": 1.1181172132492065, + "learning_rate": 1.9056420326240466e-05, + "loss": 0.9222, + "step": 389 + }, + { + "epoch": 0.14290949065591793, + "grad_norm": 1.0245875120162964, + "learning_rate": 1.9051514795106528e-05, + "loss": 0.9345, + "step": 390 + }, + { + "epoch": 0.14327592524734334, + "grad_norm": 1.1641144752502441, + "learning_rate": 1.904659718019989e-05, + "loss": 0.9257, + "step": 391 + }, + { + "epoch": 0.14364235983876877, + "grad_norm": 1.039544701576233, + "learning_rate": 1.9041667488085566e-05, + "loss": 0.9521, + "step": 392 + }, + { + "epoch": 0.1440087944301942, + "grad_norm": 1.0379778146743774, + "learning_rate": 1.9036725725344693e-05, + "loss": 0.9192, + "step": 393 + }, + { + "epoch": 0.14437522902161964, + "grad_norm": 1.2251654863357544, + "learning_rate": 1.903177189857453e-05, + "loss": 0.8815, + "step": 394 + }, + { + "epoch": 0.14474166361304508, + "grad_norm": 1.0674530267715454, + "learning_rate": 1.902680601438843e-05, + "loss": 0.9159, + "step": 395 + }, + { + "epoch": 0.14510809820447051, + "grad_norm": 1.0083818435668945, + "learning_rate": 1.9021828079415852e-05, + "loss": 0.8784, + "step": 396 + }, + { + "epoch": 0.14547453279589592, + "grad_norm": 0.8914971947669983, + "learning_rate": 1.9016838100302336e-05, + "loss": 0.5819, + "step": 397 + }, + { + "epoch": 0.14584096738732136, + "grad_norm": 1.086569905281067, + "learning_rate": 1.9011836083709513e-05, + "loss": 0.9501, + "step": 398 + }, + { + "epoch": 0.1462074019787468, + "grad_norm": 1.1220741271972656, + "learning_rate": 1.9006822036315066e-05, + "loss": 0.8849, + "step": 399 + }, + { + "epoch": 0.14657383657017223, + "grad_norm": 1.1456749439239502, + "learning_rate": 1.900179596481275e-05, + "loss": 0.9831, + "step": 400 + }, + { + "epoch": 0.14694027116159766, + "grad_norm": 1.1781152486801147, + "learning_rate": 1.899675787591238e-05, + "loss": 0.9056, + "step": 401 + }, + { + "epoch": 0.1473067057530231, + "grad_norm": 1.0729511976242065, + "learning_rate": 1.899170777633979e-05, + "loss": 0.9269, + "step": 402 + }, + { + "epoch": 0.1476731403444485, + "grad_norm": 1.3148356676101685, + "learning_rate": 1.8986645672836873e-05, + "loss": 0.8364, + "step": 403 + }, + { + "epoch": 0.14803957493587394, + "grad_norm": 1.1030545234680176, + "learning_rate": 1.898157157216154e-05, + "loss": 0.9535, + "step": 404 + }, + { + "epoch": 0.14840600952729938, + "grad_norm": 1.0783032178878784, + "learning_rate": 1.8976485481087707e-05, + "loss": 0.9164, + "step": 405 + }, + { + "epoch": 0.1487724441187248, + "grad_norm": 1.2315967082977295, + "learning_rate": 1.8971387406405307e-05, + "loss": 0.864, + "step": 406 + }, + { + "epoch": 0.14913887871015025, + "grad_norm": 1.0555812120437622, + "learning_rate": 1.8966277354920278e-05, + "loss": 0.9607, + "step": 407 + }, + { + "epoch": 0.14950531330157568, + "grad_norm": 1.1461411714553833, + "learning_rate": 1.8961155333454534e-05, + "loss": 0.9076, + "step": 408 + }, + { + "epoch": 0.1498717478930011, + "grad_norm": 1.1050230264663696, + "learning_rate": 1.895602134884597e-05, + "loss": 1.0034, + "step": 409 + }, + { + "epoch": 0.15023818248442652, + "grad_norm": 1.09053373336792, + "learning_rate": 1.8950875407948462e-05, + "loss": 0.9454, + "step": 410 + }, + { + "epoch": 0.15060461707585196, + "grad_norm": 1.027106761932373, + "learning_rate": 1.8945717517631833e-05, + "loss": 0.8855, + "step": 411 + }, + { + "epoch": 0.1509710516672774, + "grad_norm": 0.9812170267105103, + "learning_rate": 1.894054768478188e-05, + "loss": 0.885, + "step": 412 + }, + { + "epoch": 0.15133748625870283, + "grad_norm": 0.9762282967567444, + "learning_rate": 1.893536591630032e-05, + "loss": 0.9308, + "step": 413 + }, + { + "epoch": 0.15170392085012827, + "grad_norm": 1.1730345487594604, + "learning_rate": 1.8930172219104815e-05, + "loss": 0.8527, + "step": 414 + }, + { + "epoch": 0.15207035544155367, + "grad_norm": 1.147405982017517, + "learning_rate": 1.8924966600128955e-05, + "loss": 0.9975, + "step": 415 + }, + { + "epoch": 0.1524367900329791, + "grad_norm": 1.1086004972457886, + "learning_rate": 1.8919749066322238e-05, + "loss": 0.9845, + "step": 416 + }, + { + "epoch": 0.15280322462440454, + "grad_norm": 1.0477776527404785, + "learning_rate": 1.8914519624650073e-05, + "loss": 0.9967, + "step": 417 + }, + { + "epoch": 0.15316965921582998, + "grad_norm": 1.0343297719955444, + "learning_rate": 1.8909278282093768e-05, + "loss": 1.0273, + "step": 418 + }, + { + "epoch": 0.1535360938072554, + "grad_norm": 1.105607271194458, + "learning_rate": 1.890402504565051e-05, + "loss": 0.9371, + "step": 419 + }, + { + "epoch": 0.15390252839868085, + "grad_norm": 1.175635814666748, + "learning_rate": 1.8898759922333374e-05, + "loss": 0.986, + "step": 420 + }, + { + "epoch": 0.15426896299010626, + "grad_norm": 1.0726004838943481, + "learning_rate": 1.88934829191713e-05, + "loss": 0.9664, + "step": 421 + }, + { + "epoch": 0.1546353975815317, + "grad_norm": 1.0392824411392212, + "learning_rate": 1.8888194043209082e-05, + "loss": 0.9196, + "step": 422 + }, + { + "epoch": 0.15500183217295713, + "grad_norm": 1.1240150928497314, + "learning_rate": 1.888289330150738e-05, + "loss": 0.92, + "step": 423 + }, + { + "epoch": 0.15536826676438256, + "grad_norm": 1.0844699144363403, + "learning_rate": 1.887758070114267e-05, + "loss": 0.9914, + "step": 424 + }, + { + "epoch": 0.155734701355808, + "grad_norm": 1.0138442516326904, + "learning_rate": 1.887225624920729e-05, + "loss": 0.9292, + "step": 425 + }, + { + "epoch": 0.15610113594723343, + "grad_norm": 1.0808322429656982, + "learning_rate": 1.8866919952809374e-05, + "loss": 0.8664, + "step": 426 + }, + { + "epoch": 0.15646757053865884, + "grad_norm": 0.9856568574905396, + "learning_rate": 1.8861571819072874e-05, + "loss": 0.9507, + "step": 427 + }, + { + "epoch": 0.15683400513008428, + "grad_norm": 1.2238532304763794, + "learning_rate": 1.885621185513756e-05, + "loss": 0.8829, + "step": 428 + }, + { + "epoch": 0.1572004397215097, + "grad_norm": 1.194239854812622, + "learning_rate": 1.8850840068158975e-05, + "loss": 1.0353, + "step": 429 + }, + { + "epoch": 0.15756687431293515, + "grad_norm": 1.12733793258667, + "learning_rate": 1.884545646530846e-05, + "loss": 0.903, + "step": 430 + }, + { + "epoch": 0.15793330890436058, + "grad_norm": 1.1467331647872925, + "learning_rate": 1.8840061053773124e-05, + "loss": 0.9051, + "step": 431 + }, + { + "epoch": 0.158299743495786, + "grad_norm": 1.0669034719467163, + "learning_rate": 1.883465384075584e-05, + "loss": 0.8962, + "step": 432 + }, + { + "epoch": 0.15866617808721142, + "grad_norm": 1.2676420211791992, + "learning_rate": 1.882923483347524e-05, + "loss": 0.965, + "step": 433 + }, + { + "epoch": 0.15903261267863686, + "grad_norm": 1.1339911222457886, + "learning_rate": 1.8823804039165697e-05, + "loss": 0.9037, + "step": 434 + }, + { + "epoch": 0.1593990472700623, + "grad_norm": 1.0925798416137695, + "learning_rate": 1.881836146507732e-05, + "loss": 0.9377, + "step": 435 + }, + { + "epoch": 0.15976548186148773, + "grad_norm": 1.2156996726989746, + "learning_rate": 1.8812907118475952e-05, + "loss": 0.8098, + "step": 436 + }, + { + "epoch": 0.16013191645291316, + "grad_norm": 1.1337549686431885, + "learning_rate": 1.8807441006643138e-05, + "loss": 0.9229, + "step": 437 + }, + { + "epoch": 0.16049835104433857, + "grad_norm": 1.2260011434555054, + "learning_rate": 1.8801963136876146e-05, + "loss": 0.9081, + "step": 438 + }, + { + "epoch": 0.160864785635764, + "grad_norm": 1.1918498277664185, + "learning_rate": 1.8796473516487923e-05, + "loss": 0.9247, + "step": 439 + }, + { + "epoch": 0.16123122022718944, + "grad_norm": 1.2885518074035645, + "learning_rate": 1.8790972152807122e-05, + "loss": 0.9089, + "step": 440 + }, + { + "epoch": 0.16159765481861488, + "grad_norm": 1.2859281301498413, + "learning_rate": 1.8785459053178056e-05, + "loss": 0.9167, + "step": 441 + }, + { + "epoch": 0.1619640894100403, + "grad_norm": 1.0627514123916626, + "learning_rate": 1.877993422496072e-05, + "loss": 0.8814, + "step": 442 + }, + { + "epoch": 0.16233052400146575, + "grad_norm": 1.1065446138381958, + "learning_rate": 1.8774397675530754e-05, + "loss": 0.9363, + "step": 443 + }, + { + "epoch": 0.16269695859289116, + "grad_norm": 1.2442569732666016, + "learning_rate": 1.8768849412279456e-05, + "loss": 0.9993, + "step": 444 + }, + { + "epoch": 0.1630633931843166, + "grad_norm": 0.9614419341087341, + "learning_rate": 1.8763289442613753e-05, + "loss": 0.9399, + "step": 445 + }, + { + "epoch": 0.16342982777574203, + "grad_norm": 1.0689506530761719, + "learning_rate": 1.875771777395621e-05, + "loss": 0.9789, + "step": 446 + }, + { + "epoch": 0.16379626236716746, + "grad_norm": 1.1324421167373657, + "learning_rate": 1.8752134413745003e-05, + "loss": 0.9286, + "step": 447 + }, + { + "epoch": 0.1641626969585929, + "grad_norm": 1.3980520963668823, + "learning_rate": 1.874653936943392e-05, + "loss": 0.8706, + "step": 448 + }, + { + "epoch": 0.16452913155001833, + "grad_norm": 1.0804977416992188, + "learning_rate": 1.8740932648492337e-05, + "loss": 0.9385, + "step": 449 + }, + { + "epoch": 0.16489556614144374, + "grad_norm": 1.0773653984069824, + "learning_rate": 1.8735314258405237e-05, + "loss": 0.9025, + "step": 450 + }, + { + "epoch": 0.16526200073286917, + "grad_norm": 1.056978464126587, + "learning_rate": 1.8729684206673167e-05, + "loss": 0.9076, + "step": 451 + }, + { + "epoch": 0.1656284353242946, + "grad_norm": 1.0701828002929688, + "learning_rate": 1.872404250081225e-05, + "loss": 0.9008, + "step": 452 + }, + { + "epoch": 0.16599486991572004, + "grad_norm": 1.309175729751587, + "learning_rate": 1.871838914835415e-05, + "loss": 0.9097, + "step": 453 + }, + { + "epoch": 0.16636130450714548, + "grad_norm": 0.9885631203651428, + "learning_rate": 1.871272415684611e-05, + "loss": 0.9232, + "step": 454 + }, + { + "epoch": 0.16672773909857092, + "grad_norm": 1.313955307006836, + "learning_rate": 1.8707047533850885e-05, + "loss": 0.7769, + "step": 455 + }, + { + "epoch": 0.16709417368999632, + "grad_norm": 1.122620701789856, + "learning_rate": 1.870135928694677e-05, + "loss": 0.893, + "step": 456 + }, + { + "epoch": 0.16746060828142176, + "grad_norm": 1.1263620853424072, + "learning_rate": 1.8695659423727574e-05, + "loss": 0.9634, + "step": 457 + }, + { + "epoch": 0.1678270428728472, + "grad_norm": 1.491286277770996, + "learning_rate": 1.8689947951802616e-05, + "loss": 0.7905, + "step": 458 + }, + { + "epoch": 0.16819347746427263, + "grad_norm": 1.1821624040603638, + "learning_rate": 1.868422487879671e-05, + "loss": 0.9576, + "step": 459 + }, + { + "epoch": 0.16855991205569806, + "grad_norm": 1.2670471668243408, + "learning_rate": 1.8678490212350154e-05, + "loss": 0.9215, + "step": 460 + }, + { + "epoch": 0.1689263466471235, + "grad_norm": 1.2155308723449707, + "learning_rate": 1.8672743960118733e-05, + "loss": 0.9208, + "step": 461 + }, + { + "epoch": 0.1692927812385489, + "grad_norm": 1.062874436378479, + "learning_rate": 1.8666986129773695e-05, + "loss": 0.94, + "step": 462 + }, + { + "epoch": 0.16965921582997434, + "grad_norm": 0.9979032874107361, + "learning_rate": 1.8661216729001743e-05, + "loss": 0.9396, + "step": 463 + }, + { + "epoch": 0.17002565042139978, + "grad_norm": 0.9946810007095337, + "learning_rate": 1.8655435765505023e-05, + "loss": 0.9787, + "step": 464 + }, + { + "epoch": 0.1703920850128252, + "grad_norm": 1.2013760805130005, + "learning_rate": 1.864964324700113e-05, + "loss": 0.9518, + "step": 465 + }, + { + "epoch": 0.17075851960425065, + "grad_norm": 1.1290384531021118, + "learning_rate": 1.8643839181223068e-05, + "loss": 0.9795, + "step": 466 + }, + { + "epoch": 0.17112495419567608, + "grad_norm": 1.229293704032898, + "learning_rate": 1.863802357591927e-05, + "loss": 0.9091, + "step": 467 + }, + { + "epoch": 0.1714913887871015, + "grad_norm": 1.096228003501892, + "learning_rate": 1.8632196438853567e-05, + "loss": 0.952, + "step": 468 + }, + { + "epoch": 0.17185782337852692, + "grad_norm": 1.2736438512802124, + "learning_rate": 1.862635777780519e-05, + "loss": 0.941, + "step": 469 + }, + { + "epoch": 0.17222425796995236, + "grad_norm": 1.2913713455200195, + "learning_rate": 1.862050760056875e-05, + "loss": 0.8991, + "step": 470 + }, + { + "epoch": 0.1725906925613778, + "grad_norm": 1.2449238300323486, + "learning_rate": 1.861464591495423e-05, + "loss": 0.9098, + "step": 471 + }, + { + "epoch": 0.17295712715280323, + "grad_norm": 1.1578012704849243, + "learning_rate": 1.860877272878699e-05, + "loss": 0.8868, + "step": 472 + }, + { + "epoch": 0.17332356174422867, + "grad_norm": 1.0383613109588623, + "learning_rate": 1.8602888049907725e-05, + "loss": 0.9155, + "step": 473 + }, + { + "epoch": 0.17368999633565407, + "grad_norm": 1.028679370880127, + "learning_rate": 1.859699188617249e-05, + "loss": 0.9295, + "step": 474 + }, + { + "epoch": 0.1740564309270795, + "grad_norm": 1.1471338272094727, + "learning_rate": 1.8591084245452654e-05, + "loss": 0.8889, + "step": 475 + }, + { + "epoch": 0.17442286551850494, + "grad_norm": 1.1970279216766357, + "learning_rate": 1.8585165135634926e-05, + "loss": 0.8554, + "step": 476 + }, + { + "epoch": 0.17478930010993038, + "grad_norm": 1.0609773397445679, + "learning_rate": 1.857923456462131e-05, + "loss": 0.8865, + "step": 477 + }, + { + "epoch": 0.17515573470135581, + "grad_norm": 1.135913610458374, + "learning_rate": 1.8573292540329126e-05, + "loss": 0.9691, + "step": 478 + }, + { + "epoch": 0.17552216929278125, + "grad_norm": 1.1295160055160522, + "learning_rate": 1.8567339070690972e-05, + "loss": 0.9278, + "step": 479 + }, + { + "epoch": 0.17588860388420666, + "grad_norm": 1.269704818725586, + "learning_rate": 1.856137416365473e-05, + "loss": 0.8976, + "step": 480 + }, + { + "epoch": 0.1762550384756321, + "grad_norm": 1.2077471017837524, + "learning_rate": 1.8555397827183557e-05, + "loss": 0.8924, + "step": 481 + }, + { + "epoch": 0.17662147306705753, + "grad_norm": 1.1305952072143555, + "learning_rate": 1.8549410069255863e-05, + "loss": 0.9248, + "step": 482 + }, + { + "epoch": 0.17698790765848296, + "grad_norm": 1.317244529724121, + "learning_rate": 1.8543410897865295e-05, + "loss": 0.8066, + "step": 483 + }, + { + "epoch": 0.1773543422499084, + "grad_norm": 1.138925552368164, + "learning_rate": 1.8537400321020757e-05, + "loss": 0.9526, + "step": 484 + }, + { + "epoch": 0.17772077684133383, + "grad_norm": 1.3495008945465088, + "learning_rate": 1.853137834674636e-05, + "loss": 0.8081, + "step": 485 + }, + { + "epoch": 0.17808721143275924, + "grad_norm": 1.4174842834472656, + "learning_rate": 1.852534498308145e-05, + "loss": 0.7942, + "step": 486 + }, + { + "epoch": 0.17845364602418468, + "grad_norm": 1.2117899656295776, + "learning_rate": 1.851930023808056e-05, + "loss": 0.9396, + "step": 487 + }, + { + "epoch": 0.1788200806156101, + "grad_norm": 1.273874282836914, + "learning_rate": 1.8513244119813428e-05, + "loss": 0.9593, + "step": 488 + }, + { + "epoch": 0.17918651520703555, + "grad_norm": 1.2929598093032837, + "learning_rate": 1.8507176636364973e-05, + "loss": 0.9401, + "step": 489 + }, + { + "epoch": 0.17955294979846098, + "grad_norm": 1.0998018980026245, + "learning_rate": 1.8501097795835284e-05, + "loss": 0.9874, + "step": 490 + }, + { + "epoch": 0.17991938438988642, + "grad_norm": 1.18282949924469, + "learning_rate": 1.8495007606339615e-05, + "loss": 0.9393, + "step": 491 + }, + { + "epoch": 0.18028581898131182, + "grad_norm": 1.1368458271026611, + "learning_rate": 1.8488906076008368e-05, + "loss": 0.9364, + "step": 492 + }, + { + "epoch": 0.18065225357273726, + "grad_norm": 1.1537795066833496, + "learning_rate": 1.8482793212987088e-05, + "loss": 0.9196, + "step": 493 + }, + { + "epoch": 0.1810186881641627, + "grad_norm": 1.2160780429840088, + "learning_rate": 1.8476669025436448e-05, + "loss": 0.8206, + "step": 494 + }, + { + "epoch": 0.18138512275558813, + "grad_norm": 1.1482170820236206, + "learning_rate": 1.8470533521532235e-05, + "loss": 0.9044, + "step": 495 + }, + { + "epoch": 0.18175155734701356, + "grad_norm": 1.2820298671722412, + "learning_rate": 1.846438670946535e-05, + "loss": 0.9536, + "step": 496 + }, + { + "epoch": 0.182117991938439, + "grad_norm": 1.358842134475708, + "learning_rate": 1.8458228597441783e-05, + "loss": 0.7656, + "step": 497 + }, + { + "epoch": 0.1824844265298644, + "grad_norm": 1.266168236732483, + "learning_rate": 1.8452059193682622e-05, + "loss": 0.9818, + "step": 498 + }, + { + "epoch": 0.18285086112128984, + "grad_norm": 1.104447841644287, + "learning_rate": 1.8445878506424013e-05, + "loss": 0.9381, + "step": 499 + }, + { + "epoch": 0.18321729571271528, + "grad_norm": 1.1257696151733398, + "learning_rate": 1.843968654391718e-05, + "loss": 0.8861, + "step": 500 + }, + { + "epoch": 0.1835837303041407, + "grad_norm": 1.0890069007873535, + "learning_rate": 1.843348331442839e-05, + "loss": 0.9278, + "step": 501 + }, + { + "epoch": 0.18395016489556615, + "grad_norm": 1.0580757856369019, + "learning_rate": 1.8427268826238954e-05, + "loss": 0.9762, + "step": 502 + }, + { + "epoch": 0.18431659948699158, + "grad_norm": 1.441726565361023, + "learning_rate": 1.8421043087645216e-05, + "loss": 0.8981, + "step": 503 + }, + { + "epoch": 0.184683034078417, + "grad_norm": 1.272265076637268, + "learning_rate": 1.8414806106958535e-05, + "loss": 0.9042, + "step": 504 + }, + { + "epoch": 0.18504946866984243, + "grad_norm": 1.1588517427444458, + "learning_rate": 1.8408557892505285e-05, + "loss": 0.9226, + "step": 505 + }, + { + "epoch": 0.18541590326126786, + "grad_norm": 1.3338249921798706, + "learning_rate": 1.840229845262683e-05, + "loss": 0.9488, + "step": 506 + }, + { + "epoch": 0.1857823378526933, + "grad_norm": 1.1822744607925415, + "learning_rate": 1.839602779567952e-05, + "loss": 0.8742, + "step": 507 + }, + { + "epoch": 0.18614877244411873, + "grad_norm": 1.1773830652236938, + "learning_rate": 1.8389745930034686e-05, + "loss": 0.9124, + "step": 508 + }, + { + "epoch": 0.18651520703554417, + "grad_norm": 1.115365982055664, + "learning_rate": 1.8383452864078618e-05, + "loss": 0.8988, + "step": 509 + }, + { + "epoch": 0.18688164162696957, + "grad_norm": 1.1186784505844116, + "learning_rate": 1.8377148606212558e-05, + "loss": 0.903, + "step": 510 + }, + { + "epoch": 0.187248076218395, + "grad_norm": 1.2818164825439453, + "learning_rate": 1.8370833164852694e-05, + "loss": 0.8521, + "step": 511 + }, + { + "epoch": 0.18761451080982045, + "grad_norm": 1.0865288972854614, + "learning_rate": 1.8364506548430133e-05, + "loss": 0.905, + "step": 512 + }, + { + "epoch": 0.18798094540124588, + "grad_norm": 1.1884108781814575, + "learning_rate": 1.835816876539092e-05, + "loss": 0.9714, + "step": 513 + }, + { + "epoch": 0.18834737999267132, + "grad_norm": 1.1109466552734375, + "learning_rate": 1.8351819824195988e-05, + "loss": 0.9337, + "step": 514 + }, + { + "epoch": 0.18871381458409675, + "grad_norm": 1.3398234844207764, + "learning_rate": 1.834545973332117e-05, + "loss": 1.011, + "step": 515 + }, + { + "epoch": 0.18908024917552216, + "grad_norm": 1.2325025796890259, + "learning_rate": 1.8339088501257194e-05, + "loss": 0.9101, + "step": 516 + }, + { + "epoch": 0.1894466837669476, + "grad_norm": 1.158435344696045, + "learning_rate": 1.8332706136509654e-05, + "loss": 0.9724, + "step": 517 + }, + { + "epoch": 0.18981311835837303, + "grad_norm": 1.3573687076568604, + "learning_rate": 1.832631264759901e-05, + "loss": 0.9551, + "step": 518 + }, + { + "epoch": 0.19017955294979846, + "grad_norm": 1.2095463275909424, + "learning_rate": 1.8319908043060562e-05, + "loss": 0.8611, + "step": 519 + }, + { + "epoch": 0.1905459875412239, + "grad_norm": 1.2500462532043457, + "learning_rate": 1.8313492331444468e-05, + "loss": 0.8936, + "step": 520 + }, + { + "epoch": 0.19091242213264933, + "grad_norm": 1.3058297634124756, + "learning_rate": 1.830706552131569e-05, + "loss": 0.9843, + "step": 521 + }, + { + "epoch": 0.19127885672407474, + "grad_norm": 1.47245454788208, + "learning_rate": 1.8300627621254038e-05, + "loss": 0.7822, + "step": 522 + }, + { + "epoch": 0.19164529131550018, + "grad_norm": 1.1628376245498657, + "learning_rate": 1.829417863985409e-05, + "loss": 0.9191, + "step": 523 + }, + { + "epoch": 0.1920117259069256, + "grad_norm": 1.1671644449234009, + "learning_rate": 1.828771858572525e-05, + "loss": 0.8991, + "step": 524 + }, + { + "epoch": 0.19237816049835105, + "grad_norm": 1.2752845287322998, + "learning_rate": 1.828124746749168e-05, + "loss": 0.844, + "step": 525 + }, + { + "epoch": 0.19274459508977648, + "grad_norm": 1.2671666145324707, + "learning_rate": 1.8274765293792328e-05, + "loss": 0.7955, + "step": 526 + }, + { + "epoch": 0.19311102968120192, + "grad_norm": 1.3220008611679077, + "learning_rate": 1.8268272073280902e-05, + "loss": 0.8403, + "step": 527 + }, + { + "epoch": 0.19347746427262733, + "grad_norm": 1.256608009338379, + "learning_rate": 1.8261767814625844e-05, + "loss": 0.9375, + "step": 528 + }, + { + "epoch": 0.19384389886405276, + "grad_norm": 1.2269988059997559, + "learning_rate": 1.8255252526510344e-05, + "loss": 0.9036, + "step": 529 + }, + { + "epoch": 0.1942103334554782, + "grad_norm": 1.3886499404907227, + "learning_rate": 1.824872621763231e-05, + "loss": 0.9434, + "step": 530 + }, + { + "epoch": 0.19457676804690363, + "grad_norm": 1.1359144449234009, + "learning_rate": 1.8242188896704362e-05, + "loss": 0.9783, + "step": 531 + }, + { + "epoch": 0.19494320263832907, + "grad_norm": 1.2621866464614868, + "learning_rate": 1.823564057245383e-05, + "loss": 0.9, + "step": 532 + }, + { + "epoch": 0.1953096372297545, + "grad_norm": 1.1670093536376953, + "learning_rate": 1.8229081253622718e-05, + "loss": 0.9509, + "step": 533 + }, + { + "epoch": 0.1956760718211799, + "grad_norm": 1.209097146987915, + "learning_rate": 1.8222510948967726e-05, + "loss": 0.9837, + "step": 534 + }, + { + "epoch": 0.19604250641260534, + "grad_norm": 1.249265193939209, + "learning_rate": 1.8215929667260208e-05, + "loss": 0.8725, + "step": 535 + }, + { + "epoch": 0.19640894100403078, + "grad_norm": 1.1079298257827759, + "learning_rate": 1.8209337417286173e-05, + "loss": 0.9533, + "step": 536 + }, + { + "epoch": 0.19677537559545621, + "grad_norm": 1.3172224760055542, + "learning_rate": 1.8202734207846284e-05, + "loss": 0.8488, + "step": 537 + }, + { + "epoch": 0.19714181018688165, + "grad_norm": 1.2152198553085327, + "learning_rate": 1.819612004775581e-05, + "loss": 0.9447, + "step": 538 + }, + { + "epoch": 0.19750824477830708, + "grad_norm": 1.367629885673523, + "learning_rate": 1.8189494945844673e-05, + "loss": 0.8947, + "step": 539 + }, + { + "epoch": 0.1978746793697325, + "grad_norm": 1.0554075241088867, + "learning_rate": 1.818285891095737e-05, + "loss": 0.9616, + "step": 540 + }, + { + "epoch": 0.19824111396115793, + "grad_norm": 1.3592339754104614, + "learning_rate": 1.8176211951953015e-05, + "loss": 0.7161, + "step": 541 + }, + { + "epoch": 0.19860754855258336, + "grad_norm": 1.1595011949539185, + "learning_rate": 1.8169554077705295e-05, + "loss": 0.9479, + "step": 542 + }, + { + "epoch": 0.1989739831440088, + "grad_norm": 1.3406450748443604, + "learning_rate": 1.8162885297102475e-05, + "loss": 0.8909, + "step": 543 + }, + { + "epoch": 0.19934041773543423, + "grad_norm": 1.242037296295166, + "learning_rate": 1.815620561904738e-05, + "loss": 0.9435, + "step": 544 + }, + { + "epoch": 0.19970685232685967, + "grad_norm": 1.229488492012024, + "learning_rate": 1.8149515052457375e-05, + "loss": 0.8382, + "step": 545 + }, + { + "epoch": 0.20007328691828508, + "grad_norm": 1.371599793434143, + "learning_rate": 1.814281360626437e-05, + "loss": 0.8936, + "step": 546 + }, + { + "epoch": 0.2004397215097105, + "grad_norm": 1.2121071815490723, + "learning_rate": 1.8136101289414797e-05, + "loss": 0.8484, + "step": 547 + }, + { + "epoch": 0.20080615610113595, + "grad_norm": 1.1973918676376343, + "learning_rate": 1.8129378110869592e-05, + "loss": 0.9279, + "step": 548 + }, + { + "epoch": 0.20117259069256138, + "grad_norm": 1.1278659105300903, + "learning_rate": 1.812264407960421e-05, + "loss": 0.8555, + "step": 549 + }, + { + "epoch": 0.20153902528398682, + "grad_norm": 1.1396849155426025, + "learning_rate": 1.8115899204608575e-05, + "loss": 0.8995, + "step": 550 + }, + { + "epoch": 0.20190545987541225, + "grad_norm": 1.102533221244812, + "learning_rate": 1.8109143494887097e-05, + "loss": 0.901, + "step": 551 + }, + { + "epoch": 0.20227189446683766, + "grad_norm": 1.2453352212905884, + "learning_rate": 1.810237695945865e-05, + "loss": 0.8631, + "step": 552 + }, + { + "epoch": 0.2026383290582631, + "grad_norm": 1.165541172027588, + "learning_rate": 1.8095599607356556e-05, + "loss": 0.9209, + "step": 553 + }, + { + "epoch": 0.20300476364968853, + "grad_norm": 1.2267589569091797, + "learning_rate": 1.808881144762859e-05, + "loss": 0.857, + "step": 554 + }, + { + "epoch": 0.20337119824111397, + "grad_norm": 1.1667150259017944, + "learning_rate": 1.8082012489336934e-05, + "loss": 0.9138, + "step": 555 + }, + { + "epoch": 0.2037376328325394, + "grad_norm": 1.3724040985107422, + "learning_rate": 1.80752027415582e-05, + "loss": 0.8979, + "step": 556 + }, + { + "epoch": 0.20410406742396484, + "grad_norm": 1.4671728610992432, + "learning_rate": 1.806838221338341e-05, + "loss": 0.8206, + "step": 557 + }, + { + "epoch": 0.20447050201539024, + "grad_norm": 1.176984429359436, + "learning_rate": 1.806155091391796e-05, + "loss": 0.9374, + "step": 558 + }, + { + "epoch": 0.20483693660681568, + "grad_norm": 1.1816294193267822, + "learning_rate": 1.8054708852281638e-05, + "loss": 0.9485, + "step": 559 + }, + { + "epoch": 0.2052033711982411, + "grad_norm": 1.2052361965179443, + "learning_rate": 1.80478560376086e-05, + "loss": 0.9411, + "step": 560 + }, + { + "epoch": 0.20556980578966655, + "grad_norm": 1.2356535196304321, + "learning_rate": 1.804099247904735e-05, + "loss": 0.9236, + "step": 561 + }, + { + "epoch": 0.20593624038109198, + "grad_norm": 1.0646589994430542, + "learning_rate": 1.803411818576074e-05, + "loss": 0.931, + "step": 562 + }, + { + "epoch": 0.20630267497251742, + "grad_norm": 1.207443118095398, + "learning_rate": 1.802723316692595e-05, + "loss": 0.9101, + "step": 563 + }, + { + "epoch": 0.20666910956394283, + "grad_norm": 1.25215744972229, + "learning_rate": 1.8020337431734484e-05, + "loss": 0.9273, + "step": 564 + }, + { + "epoch": 0.20703554415536826, + "grad_norm": 1.1377441883087158, + "learning_rate": 1.8013430989392146e-05, + "loss": 0.857, + "step": 565 + }, + { + "epoch": 0.2074019787467937, + "grad_norm": 1.185183048248291, + "learning_rate": 1.800651384911904e-05, + "loss": 0.9233, + "step": 566 + }, + { + "epoch": 0.20776841333821913, + "grad_norm": 1.4221135377883911, + "learning_rate": 1.7999586020149546e-05, + "loss": 0.8213, + "step": 567 + }, + { + "epoch": 0.20813484792964457, + "grad_norm": 1.2080090045928955, + "learning_rate": 1.799264751173232e-05, + "loss": 0.8868, + "step": 568 + }, + { + "epoch": 0.20850128252107, + "grad_norm": 1.3872716426849365, + "learning_rate": 1.7985698333130263e-05, + "loss": 0.8625, + "step": 569 + }, + { + "epoch": 0.2088677171124954, + "grad_norm": 1.171733021736145, + "learning_rate": 1.7978738493620534e-05, + "loss": 0.9601, + "step": 570 + }, + { + "epoch": 0.20923415170392085, + "grad_norm": 1.206907033920288, + "learning_rate": 1.797176800249452e-05, + "loss": 0.9184, + "step": 571 + }, + { + "epoch": 0.20960058629534628, + "grad_norm": 1.104447603225708, + "learning_rate": 1.7964786869057826e-05, + "loss": 0.9092, + "step": 572 + }, + { + "epoch": 0.20996702088677172, + "grad_norm": 1.2680176496505737, + "learning_rate": 1.7957795102630265e-05, + "loss": 0.9294, + "step": 573 + }, + { + "epoch": 0.21033345547819715, + "grad_norm": 1.1666079759597778, + "learning_rate": 1.7950792712545846e-05, + "loss": 0.9258, + "step": 574 + }, + { + "epoch": 0.21069989006962256, + "grad_norm": 1.1464416980743408, + "learning_rate": 1.7943779708152763e-05, + "loss": 0.9132, + "step": 575 + }, + { + "epoch": 0.211066324661048, + "grad_norm": 1.243110179901123, + "learning_rate": 1.793675609881337e-05, + "loss": 0.8997, + "step": 576 + }, + { + "epoch": 0.21143275925247343, + "grad_norm": 1.3148272037506104, + "learning_rate": 1.792972189390419e-05, + "loss": 0.8708, + "step": 577 + }, + { + "epoch": 0.21179919384389886, + "grad_norm": 1.1587334871292114, + "learning_rate": 1.7922677102815886e-05, + "loss": 0.9239, + "step": 578 + }, + { + "epoch": 0.2121656284353243, + "grad_norm": 1.2452987432479858, + "learning_rate": 1.7915621734953255e-05, + "loss": 0.9119, + "step": 579 + }, + { + "epoch": 0.21253206302674973, + "grad_norm": 1.1972370147705078, + "learning_rate": 1.7908555799735217e-05, + "loss": 0.9042, + "step": 580 + }, + { + "epoch": 0.21289849761817514, + "grad_norm": 1.3831322193145752, + "learning_rate": 1.7901479306594794e-05, + "loss": 0.9228, + "step": 581 + }, + { + "epoch": 0.21326493220960058, + "grad_norm": 1.3178349733352661, + "learning_rate": 1.78943922649791e-05, + "loss": 0.8248, + "step": 582 + }, + { + "epoch": 0.213631366801026, + "grad_norm": 1.0427035093307495, + "learning_rate": 1.788729468434934e-05, + "loss": 0.8347, + "step": 583 + }, + { + "epoch": 0.21399780139245145, + "grad_norm": 1.55295991897583, + "learning_rate": 1.7880186574180782e-05, + "loss": 0.9322, + "step": 584 + }, + { + "epoch": 0.21436423598387688, + "grad_norm": 1.244181513786316, + "learning_rate": 1.7873067943962756e-05, + "loss": 0.8777, + "step": 585 + }, + { + "epoch": 0.21473067057530232, + "grad_norm": 1.3763676881790161, + "learning_rate": 1.7865938803198635e-05, + "loss": 0.9281, + "step": 586 + }, + { + "epoch": 0.21509710516672773, + "grad_norm": 1.5503166913986206, + "learning_rate": 1.7858799161405818e-05, + "loss": 0.9044, + "step": 587 + }, + { + "epoch": 0.21546353975815316, + "grad_norm": 1.391393780708313, + "learning_rate": 1.7851649028115728e-05, + "loss": 0.8613, + "step": 588 + }, + { + "epoch": 0.2158299743495786, + "grad_norm": 1.203378677368164, + "learning_rate": 1.7844488412873798e-05, + "loss": 0.84, + "step": 589 + }, + { + "epoch": 0.21619640894100403, + "grad_norm": 1.3878751993179321, + "learning_rate": 1.783731732523944e-05, + "loss": 0.9575, + "step": 590 + }, + { + "epoch": 0.21656284353242947, + "grad_norm": 1.523260235786438, + "learning_rate": 1.783013577478607e-05, + "loss": 0.88, + "step": 591 + }, + { + "epoch": 0.2169292781238549, + "grad_norm": 1.2326833009719849, + "learning_rate": 1.7822943771101047e-05, + "loss": 0.8595, + "step": 592 + }, + { + "epoch": 0.2172957127152803, + "grad_norm": 1.4009530544281006, + "learning_rate": 1.78157413237857e-05, + "loss": 0.8396, + "step": 593 + }, + { + "epoch": 0.21766214730670574, + "grad_norm": 1.1550780534744263, + "learning_rate": 1.78085284424553e-05, + "loss": 0.903, + "step": 594 + }, + { + "epoch": 0.21802858189813118, + "grad_norm": 1.2234489917755127, + "learning_rate": 1.7801305136739038e-05, + "loss": 0.8886, + "step": 595 + }, + { + "epoch": 0.21839501648955661, + "grad_norm": 1.4011855125427246, + "learning_rate": 1.779407141628003e-05, + "loss": 0.8986, + "step": 596 + }, + { + "epoch": 0.21876145108098205, + "grad_norm": 1.4119914770126343, + "learning_rate": 1.7786827290735295e-05, + "loss": 0.8956, + "step": 597 + }, + { + "epoch": 0.21912788567240749, + "grad_norm": 1.356140375137329, + "learning_rate": 1.7779572769775738e-05, + "loss": 0.915, + "step": 598 + }, + { + "epoch": 0.2194943202638329, + "grad_norm": 1.4082356691360474, + "learning_rate": 1.777230786308614e-05, + "loss": 0.8769, + "step": 599 + }, + { + "epoch": 0.21986075485525833, + "grad_norm": 1.1782652139663696, + "learning_rate": 1.7765032580365167e-05, + "loss": 0.9401, + "step": 600 + }, + { + "epoch": 0.22022718944668376, + "grad_norm": 1.3177205324172974, + "learning_rate": 1.775774693132531e-05, + "loss": 0.8825, + "step": 601 + }, + { + "epoch": 0.2205936240381092, + "grad_norm": 1.1520065069198608, + "learning_rate": 1.775045092569291e-05, + "loss": 0.8783, + "step": 602 + }, + { + "epoch": 0.22096005862953463, + "grad_norm": 1.222068428993225, + "learning_rate": 1.774314457320814e-05, + "loss": 0.9228, + "step": 603 + }, + { + "epoch": 0.22132649322096007, + "grad_norm": 1.2122035026550293, + "learning_rate": 1.773582788362498e-05, + "loss": 0.9379, + "step": 604 + }, + { + "epoch": 0.22169292781238548, + "grad_norm": 1.2832188606262207, + "learning_rate": 1.772850086671121e-05, + "loss": 0.9291, + "step": 605 + }, + { + "epoch": 0.2220593624038109, + "grad_norm": 1.3288195133209229, + "learning_rate": 1.7721163532248397e-05, + "loss": 0.9414, + "step": 606 + }, + { + "epoch": 0.22242579699523635, + "grad_norm": 1.1746708154678345, + "learning_rate": 1.7713815890031888e-05, + "loss": 0.8746, + "step": 607 + }, + { + "epoch": 0.22279223158666178, + "grad_norm": 0.9558817744255066, + "learning_rate": 1.7706457949870777e-05, + "loss": 0.5374, + "step": 608 + }, + { + "epoch": 0.22315866617808722, + "grad_norm": 1.473559856414795, + "learning_rate": 1.7699089721587925e-05, + "loss": 0.7729, + "step": 609 + }, + { + "epoch": 0.22352510076951265, + "grad_norm": 1.2237671613693237, + "learning_rate": 1.7691711215019905e-05, + "loss": 0.9161, + "step": 610 + }, + { + "epoch": 0.22389153536093806, + "grad_norm": 1.43886399269104, + "learning_rate": 1.768432244001703e-05, + "loss": 0.88, + "step": 611 + }, + { + "epoch": 0.2242579699523635, + "grad_norm": 1.2342700958251953, + "learning_rate": 1.7676923406443318e-05, + "loss": 0.9185, + "step": 612 + }, + { + "epoch": 0.22462440454378893, + "grad_norm": 1.2138959169387817, + "learning_rate": 1.7669514124176476e-05, + "loss": 0.9151, + "step": 613 + }, + { + "epoch": 0.22499083913521437, + "grad_norm": 1.2613773345947266, + "learning_rate": 1.7662094603107894e-05, + "loss": 0.9497, + "step": 614 + }, + { + "epoch": 0.2253572737266398, + "grad_norm": 1.2515242099761963, + "learning_rate": 1.765466485314264e-05, + "loss": 0.9384, + "step": 615 + }, + { + "epoch": 0.22572370831806524, + "grad_norm": 1.3340543508529663, + "learning_rate": 1.7647224884199423e-05, + "loss": 0.9223, + "step": 616 + }, + { + "epoch": 0.22609014290949064, + "grad_norm": 1.1793503761291504, + "learning_rate": 1.7639774706210605e-05, + "loss": 0.9514, + "step": 617 + }, + { + "epoch": 0.22645657750091608, + "grad_norm": 1.1925681829452515, + "learning_rate": 1.7632314329122172e-05, + "loss": 0.9048, + "step": 618 + }, + { + "epoch": 0.2268230120923415, + "grad_norm": 1.3044099807739258, + "learning_rate": 1.7624843762893734e-05, + "loss": 0.858, + "step": 619 + }, + { + "epoch": 0.22718944668376695, + "grad_norm": 1.2997506856918335, + "learning_rate": 1.761736301749849e-05, + "loss": 0.9041, + "step": 620 + }, + { + "epoch": 0.22755588127519238, + "grad_norm": 1.2851029634475708, + "learning_rate": 1.7609872102923237e-05, + "loss": 0.9003, + "step": 621 + }, + { + "epoch": 0.22792231586661782, + "grad_norm": 1.3808014392852783, + "learning_rate": 1.760237102916835e-05, + "loss": 0.8519, + "step": 622 + }, + { + "epoch": 0.22828875045804323, + "grad_norm": 1.4921283721923828, + "learning_rate": 1.759485980624776e-05, + "loss": 0.8629, + "step": 623 + }, + { + "epoch": 0.22865518504946866, + "grad_norm": 1.1690940856933594, + "learning_rate": 1.758733844418895e-05, + "loss": 0.9156, + "step": 624 + }, + { + "epoch": 0.2290216196408941, + "grad_norm": 1.3847805261611938, + "learning_rate": 1.757980695303294e-05, + "loss": 0.8851, + "step": 625 + }, + { + "epoch": 0.22938805423231953, + "grad_norm": 1.3306790590286255, + "learning_rate": 1.757226534283427e-05, + "loss": 0.9196, + "step": 626 + }, + { + "epoch": 0.22975448882374497, + "grad_norm": 1.3025264739990234, + "learning_rate": 1.7564713623660988e-05, + "loss": 0.8605, + "step": 627 + }, + { + "epoch": 0.2301209234151704, + "grad_norm": 1.1992818117141724, + "learning_rate": 1.7557151805594647e-05, + "loss": 0.9457, + "step": 628 + }, + { + "epoch": 0.2304873580065958, + "grad_norm": 1.2719687223434448, + "learning_rate": 1.7549579898730266e-05, + "loss": 0.928, + "step": 629 + }, + { + "epoch": 0.23085379259802125, + "grad_norm": 1.1798957586288452, + "learning_rate": 1.7541997913176345e-05, + "loss": 0.9542, + "step": 630 + }, + { + "epoch": 0.23122022718944668, + "grad_norm": 1.288134217262268, + "learning_rate": 1.753440585905483e-05, + "loss": 0.9595, + "step": 631 + }, + { + "epoch": 0.23158666178087212, + "grad_norm": 1.3597036600112915, + "learning_rate": 1.7526803746501125e-05, + "loss": 0.8699, + "step": 632 + }, + { + "epoch": 0.23195309637229755, + "grad_norm": 1.2226777076721191, + "learning_rate": 1.751919158566404e-05, + "loss": 0.9487, + "step": 633 + }, + { + "epoch": 0.232319530963723, + "grad_norm": 1.3789327144622803, + "learning_rate": 1.7511569386705816e-05, + "loss": 0.8582, + "step": 634 + }, + { + "epoch": 0.2326859655551484, + "grad_norm": 1.6425267457962036, + "learning_rate": 1.7503937159802084e-05, + "loss": 0.7309, + "step": 635 + }, + { + "epoch": 0.23305240014657383, + "grad_norm": 1.3353691101074219, + "learning_rate": 1.7496294915141875e-05, + "loss": 0.9254, + "step": 636 + }, + { + "epoch": 0.23341883473799926, + "grad_norm": 1.4697070121765137, + "learning_rate": 1.7488642662927574e-05, + "loss": 0.955, + "step": 637 + }, + { + "epoch": 0.2337852693294247, + "grad_norm": 1.4180868864059448, + "learning_rate": 1.748098041337495e-05, + "loss": 0.9065, + "step": 638 + }, + { + "epoch": 0.23415170392085013, + "grad_norm": 1.406212568283081, + "learning_rate": 1.74733081767131e-05, + "loss": 0.9334, + "step": 639 + }, + { + "epoch": 0.23451813851227557, + "grad_norm": 1.3697319030761719, + "learning_rate": 1.7465625963184464e-05, + "loss": 0.8559, + "step": 640 + }, + { + "epoch": 0.23488457310370098, + "grad_norm": 1.55168616771698, + "learning_rate": 1.745793378304479e-05, + "loss": 0.8157, + "step": 641 + }, + { + "epoch": 0.2352510076951264, + "grad_norm": 1.291481375694275, + "learning_rate": 1.7450231646563148e-05, + "loss": 0.9607, + "step": 642 + }, + { + "epoch": 0.23561744228655185, + "grad_norm": 1.2459636926651, + "learning_rate": 1.744251956402189e-05, + "loss": 0.9208, + "step": 643 + }, + { + "epoch": 0.23598387687797728, + "grad_norm": 1.4352821111679077, + "learning_rate": 1.7434797545716636e-05, + "loss": 0.905, + "step": 644 + }, + { + "epoch": 0.23635031146940272, + "grad_norm": 1.3828191757202148, + "learning_rate": 1.7427065601956292e-05, + "loss": 0.8894, + "step": 645 + }, + { + "epoch": 0.23671674606082815, + "grad_norm": 1.2361446619033813, + "learning_rate": 1.7419323743063e-05, + "loss": 0.9492, + "step": 646 + }, + { + "epoch": 0.23708318065225356, + "grad_norm": 1.5865951776504517, + "learning_rate": 1.7411571979372143e-05, + "loss": 0.869, + "step": 647 + }, + { + "epoch": 0.237449615243679, + "grad_norm": 1.345902681350708, + "learning_rate": 1.7403810321232324e-05, + "loss": 0.953, + "step": 648 + }, + { + "epoch": 0.23781604983510443, + "grad_norm": 1.2080811262130737, + "learning_rate": 1.7396038779005358e-05, + "loss": 0.9318, + "step": 649 + }, + { + "epoch": 0.23818248442652987, + "grad_norm": 1.2221083641052246, + "learning_rate": 1.738825736306625e-05, + "loss": 0.9838, + "step": 650 + }, + { + "epoch": 0.2385489190179553, + "grad_norm": 1.365902066230774, + "learning_rate": 1.7380466083803198e-05, + "loss": 0.8987, + "step": 651 + }, + { + "epoch": 0.23891535360938074, + "grad_norm": 1.3669956922531128, + "learning_rate": 1.7372664951617557e-05, + "loss": 0.9071, + "step": 652 + }, + { + "epoch": 0.23928178820080614, + "grad_norm": 1.4014561176300049, + "learning_rate": 1.7364853976923835e-05, + "loss": 0.8403, + "step": 653 + }, + { + "epoch": 0.23964822279223158, + "grad_norm": 1.188332200050354, + "learning_rate": 1.7357033170149692e-05, + "loss": 0.9235, + "step": 654 + }, + { + "epoch": 0.24001465738365702, + "grad_norm": 1.2374694347381592, + "learning_rate": 1.7349202541735895e-05, + "loss": 0.9465, + "step": 655 + }, + { + "epoch": 0.24038109197508245, + "grad_norm": 1.3026034832000732, + "learning_rate": 1.734136210213634e-05, + "loss": 0.9048, + "step": 656 + }, + { + "epoch": 0.24074752656650789, + "grad_norm": 1.3879164457321167, + "learning_rate": 1.733351186181802e-05, + "loss": 0.8876, + "step": 657 + }, + { + "epoch": 0.24111396115793332, + "grad_norm": 1.3328919410705566, + "learning_rate": 1.732565183126099e-05, + "loss": 0.8552, + "step": 658 + }, + { + "epoch": 0.24148039574935873, + "grad_norm": 1.3270750045776367, + "learning_rate": 1.7317782020958403e-05, + "loss": 0.8985, + "step": 659 + }, + { + "epoch": 0.24184683034078416, + "grad_norm": 1.2228972911834717, + "learning_rate": 1.730990244141646e-05, + "loss": 0.8599, + "step": 660 + }, + { + "epoch": 0.2422132649322096, + "grad_norm": 1.3068444728851318, + "learning_rate": 1.7302013103154386e-05, + "loss": 0.9188, + "step": 661 + }, + { + "epoch": 0.24257969952363503, + "grad_norm": 1.3953183889389038, + "learning_rate": 1.7294114016704458e-05, + "loss": 0.8665, + "step": 662 + }, + { + "epoch": 0.24294613411506047, + "grad_norm": 1.2629730701446533, + "learning_rate": 1.728620519261196e-05, + "loss": 0.9255, + "step": 663 + }, + { + "epoch": 0.2433125687064859, + "grad_norm": 1.3211989402770996, + "learning_rate": 1.727828664143516e-05, + "loss": 0.9707, + "step": 664 + }, + { + "epoch": 0.2436790032979113, + "grad_norm": 1.4088813066482544, + "learning_rate": 1.7270358373745334e-05, + "loss": 0.9134, + "step": 665 + }, + { + "epoch": 0.24404543788933675, + "grad_norm": 1.3637491464614868, + "learning_rate": 1.7262420400126717e-05, + "loss": 0.8627, + "step": 666 + }, + { + "epoch": 0.24441187248076218, + "grad_norm": 1.5863977670669556, + "learning_rate": 1.72544727311765e-05, + "loss": 0.8357, + "step": 667 + }, + { + "epoch": 0.24477830707218762, + "grad_norm": 1.2949635982513428, + "learning_rate": 1.724651537750483e-05, + "loss": 0.9373, + "step": 668 + }, + { + "epoch": 0.24514474166361305, + "grad_norm": 1.5079573392868042, + "learning_rate": 1.7238548349734773e-05, + "loss": 0.869, + "step": 669 + }, + { + "epoch": 0.2455111762550385, + "grad_norm": 1.920493483543396, + "learning_rate": 1.7230571658502303e-05, + "loss": 0.9121, + "step": 670 + }, + { + "epoch": 0.2458776108464639, + "grad_norm": 1.531829833984375, + "learning_rate": 1.7222585314456312e-05, + "loss": 0.8534, + "step": 671 + }, + { + "epoch": 0.24624404543788933, + "grad_norm": 1.6908535957336426, + "learning_rate": 1.7214589328258565e-05, + "loss": 0.8029, + "step": 672 + }, + { + "epoch": 0.24661048002931477, + "grad_norm": 1.3261759281158447, + "learning_rate": 1.7206583710583707e-05, + "loss": 0.9446, + "step": 673 + }, + { + "epoch": 0.2469769146207402, + "grad_norm": 1.347502589225769, + "learning_rate": 1.719856847211924e-05, + "loss": 0.9782, + "step": 674 + }, + { + "epoch": 0.24734334921216564, + "grad_norm": 1.199223518371582, + "learning_rate": 1.7190543623565498e-05, + "loss": 0.9558, + "step": 675 + }, + { + "epoch": 0.24770978380359107, + "grad_norm": 1.1717863082885742, + "learning_rate": 1.718250917563566e-05, + "loss": 0.9372, + "step": 676 + }, + { + "epoch": 0.24807621839501648, + "grad_norm": 1.303219199180603, + "learning_rate": 1.717446513905572e-05, + "loss": 0.923, + "step": 677 + }, + { + "epoch": 0.24844265298644191, + "grad_norm": 1.276764988899231, + "learning_rate": 1.716641152456446e-05, + "loss": 0.8635, + "step": 678 + }, + { + "epoch": 0.24880908757786735, + "grad_norm": 1.1870089769363403, + "learning_rate": 1.7158348342913458e-05, + "loss": 0.8853, + "step": 679 + }, + { + "epoch": 0.24917552216929278, + "grad_norm": 1.3337750434875488, + "learning_rate": 1.715027560486706e-05, + "loss": 0.8399, + "step": 680 + }, + { + "epoch": 0.24954195676071822, + "grad_norm": 1.6048245429992676, + "learning_rate": 1.7142193321202375e-05, + "loss": 0.8518, + "step": 681 + }, + { + "epoch": 0.24990839135214366, + "grad_norm": 1.397851586341858, + "learning_rate": 1.7134101502709252e-05, + "loss": 0.8988, + "step": 682 + }, + { + "epoch": 0.2502748259435691, + "grad_norm": 1.3171697854995728, + "learning_rate": 1.712600016019027e-05, + "loss": 0.8739, + "step": 683 + }, + { + "epoch": 0.2502748259435691, + "eval_loss": 0.8078320622444153, + "eval_runtime": 796.026, + "eval_samples_per_second": 3.407, + "eval_steps_per_second": 0.426, + "step": 683 + }, + { + "epoch": 0.2506412605349945, + "grad_norm": 1.196366548538208, + "learning_rate": 1.711788930446071e-05, + "loss": 0.9273, + "step": 684 + }, + { + "epoch": 0.25100769512641996, + "grad_norm": 1.5654913187026978, + "learning_rate": 1.7109768946348583e-05, + "loss": 0.7878, + "step": 685 + }, + { + "epoch": 0.25137412971784534, + "grad_norm": 1.4333325624465942, + "learning_rate": 1.7101639096694556e-05, + "loss": 0.9278, + "step": 686 + }, + { + "epoch": 0.2517405643092708, + "grad_norm": 1.56187903881073, + "learning_rate": 1.709349976635198e-05, + "loss": 0.8064, + "step": 687 + }, + { + "epoch": 0.2521069989006962, + "grad_norm": 1.242186188697815, + "learning_rate": 1.7085350966186865e-05, + "loss": 0.8858, + "step": 688 + }, + { + "epoch": 0.25247343349212165, + "grad_norm": 1.4079562425613403, + "learning_rate": 1.7077192707077854e-05, + "loss": 0.9868, + "step": 689 + }, + { + "epoch": 0.2528398680835471, + "grad_norm": 1.2565964460372925, + "learning_rate": 1.706902499991623e-05, + "loss": 0.9144, + "step": 690 + }, + { + "epoch": 0.2532063026749725, + "grad_norm": 1.2119742631912231, + "learning_rate": 1.7060847855605876e-05, + "loss": 0.9579, + "step": 691 + }, + { + "epoch": 0.25357273726639795, + "grad_norm": 1.210569143295288, + "learning_rate": 1.7052661285063287e-05, + "loss": 0.8889, + "step": 692 + }, + { + "epoch": 0.2539391718578234, + "grad_norm": 1.507573127746582, + "learning_rate": 1.7044465299217526e-05, + "loss": 0.8981, + "step": 693 + }, + { + "epoch": 0.2543056064492488, + "grad_norm": 1.2725515365600586, + "learning_rate": 1.7036259909010246e-05, + "loss": 0.9347, + "step": 694 + }, + { + "epoch": 0.25467204104067426, + "grad_norm": 1.3163098096847534, + "learning_rate": 1.7028045125395635e-05, + "loss": 0.8776, + "step": 695 + }, + { + "epoch": 0.2550384756320997, + "grad_norm": 1.2225093841552734, + "learning_rate": 1.7019820959340426e-05, + "loss": 0.8471, + "step": 696 + }, + { + "epoch": 0.25540491022352513, + "grad_norm": 1.6468865871429443, + "learning_rate": 1.7011587421823894e-05, + "loss": 0.8818, + "step": 697 + }, + { + "epoch": 0.2557713448149505, + "grad_norm": 1.2895342111587524, + "learning_rate": 1.7003344523837803e-05, + "loss": 0.917, + "step": 698 + }, + { + "epoch": 0.25613777940637594, + "grad_norm": 1.3217147588729858, + "learning_rate": 1.699509227638642e-05, + "loss": 0.9249, + "step": 699 + }, + { + "epoch": 0.2565042139978014, + "grad_norm": 1.495193600654602, + "learning_rate": 1.6986830690486507e-05, + "loss": 0.8172, + "step": 700 + }, + { + "epoch": 0.2568706485892268, + "grad_norm": 1.4029464721679688, + "learning_rate": 1.697855977716727e-05, + "loss": 0.8741, + "step": 701 + }, + { + "epoch": 0.25723708318065225, + "grad_norm": 1.5721744298934937, + "learning_rate": 1.6970279547470382e-05, + "loss": 0.9183, + "step": 702 + }, + { + "epoch": 0.2576035177720777, + "grad_norm": 1.5230120420455933, + "learning_rate": 1.6961990012449955e-05, + "loss": 0.8991, + "step": 703 + }, + { + "epoch": 0.2579699523635031, + "grad_norm": 1.426965355873108, + "learning_rate": 1.695369118317251e-05, + "loss": 0.8526, + "step": 704 + }, + { + "epoch": 0.25833638695492855, + "grad_norm": 1.231719970703125, + "learning_rate": 1.694538307071699e-05, + "loss": 0.8871, + "step": 705 + }, + { + "epoch": 0.258702821546354, + "grad_norm": 1.3230870962142944, + "learning_rate": 1.6937065686174728e-05, + "loss": 0.9368, + "step": 706 + }, + { + "epoch": 0.2590692561377794, + "grad_norm": 1.194519281387329, + "learning_rate": 1.692873904064943e-05, + "loss": 0.9452, + "step": 707 + }, + { + "epoch": 0.25943569072920486, + "grad_norm": 1.2940521240234375, + "learning_rate": 1.692040314525716e-05, + "loss": 0.9218, + "step": 708 + }, + { + "epoch": 0.2598021253206303, + "grad_norm": 1.2810473442077637, + "learning_rate": 1.6912058011126356e-05, + "loss": 0.9535, + "step": 709 + }, + { + "epoch": 0.2601685599120557, + "grad_norm": 1.210940957069397, + "learning_rate": 1.690370364939776e-05, + "loss": 0.9243, + "step": 710 + }, + { + "epoch": 0.2605349945034811, + "grad_norm": 1.413370132446289, + "learning_rate": 1.6895340071224446e-05, + "loss": 0.9291, + "step": 711 + }, + { + "epoch": 0.26090142909490655, + "grad_norm": 1.3101171255111694, + "learning_rate": 1.68869672877718e-05, + "loss": 0.8924, + "step": 712 + }, + { + "epoch": 0.261267863686332, + "grad_norm": 1.2237186431884766, + "learning_rate": 1.6878585310217472e-05, + "loss": 0.9246, + "step": 713 + }, + { + "epoch": 0.2616342982777574, + "grad_norm": 1.2307909727096558, + "learning_rate": 1.6870194149751417e-05, + "loss": 0.9654, + "step": 714 + }, + { + "epoch": 0.26200073286918285, + "grad_norm": 1.3479371070861816, + "learning_rate": 1.686179381757583e-05, + "loss": 0.8926, + "step": 715 + }, + { + "epoch": 0.2623671674606083, + "grad_norm": 1.4196346998214722, + "learning_rate": 1.685338432490516e-05, + "loss": 0.8783, + "step": 716 + }, + { + "epoch": 0.2627336020520337, + "grad_norm": 1.2949929237365723, + "learning_rate": 1.6844965682966067e-05, + "loss": 0.9335, + "step": 717 + }, + { + "epoch": 0.26310003664345916, + "grad_norm": 1.2742618322372437, + "learning_rate": 1.683653790299745e-05, + "loss": 0.9229, + "step": 718 + }, + { + "epoch": 0.2634664712348846, + "grad_norm": 1.468691110610962, + "learning_rate": 1.682810099625039e-05, + "loss": 0.7729, + "step": 719 + }, + { + "epoch": 0.26383290582631, + "grad_norm": 1.296600580215454, + "learning_rate": 1.6819654973988165e-05, + "loss": 0.9306, + "step": 720 + }, + { + "epoch": 0.2641993404177354, + "grad_norm": 1.3135846853256226, + "learning_rate": 1.6811199847486205e-05, + "loss": 0.8773, + "step": 721 + }, + { + "epoch": 0.26456577500916084, + "grad_norm": 1.3318557739257812, + "learning_rate": 1.680273562803211e-05, + "loss": 0.8922, + "step": 722 + }, + { + "epoch": 0.2649322096005863, + "grad_norm": 1.3065025806427002, + "learning_rate": 1.6794262326925616e-05, + "loss": 0.9231, + "step": 723 + }, + { + "epoch": 0.2652986441920117, + "grad_norm": 1.2115901708602905, + "learning_rate": 1.6785779955478572e-05, + "loss": 0.8406, + "step": 724 + }, + { + "epoch": 0.26566507878343715, + "grad_norm": 1.3239742517471313, + "learning_rate": 1.6777288525014958e-05, + "loss": 0.9726, + "step": 725 + }, + { + "epoch": 0.2660315133748626, + "grad_norm": 1.3156651258468628, + "learning_rate": 1.6768788046870817e-05, + "loss": 0.9525, + "step": 726 + }, + { + "epoch": 0.266397947966288, + "grad_norm": 1.624005675315857, + "learning_rate": 1.67602785323943e-05, + "loss": 0.8564, + "step": 727 + }, + { + "epoch": 0.26676438255771345, + "grad_norm": 1.3210386037826538, + "learning_rate": 1.675175999294561e-05, + "loss": 0.858, + "step": 728 + }, + { + "epoch": 0.2671308171491389, + "grad_norm": 1.3814237117767334, + "learning_rate": 1.674323243989699e-05, + "loss": 0.8559, + "step": 729 + }, + { + "epoch": 0.2674972517405643, + "grad_norm": 1.353384256362915, + "learning_rate": 1.6734695884632723e-05, + "loss": 0.8697, + "step": 730 + }, + { + "epoch": 0.26786368633198976, + "grad_norm": 1.5542417764663696, + "learning_rate": 1.6726150338549117e-05, + "loss": 0.9452, + "step": 731 + }, + { + "epoch": 0.2682301209234152, + "grad_norm": 1.4018888473510742, + "learning_rate": 1.671759581305448e-05, + "loss": 0.9139, + "step": 732 + }, + { + "epoch": 0.2685965555148406, + "grad_norm": 1.525744080543518, + "learning_rate": 1.6709032319569088e-05, + "loss": 0.9375, + "step": 733 + }, + { + "epoch": 0.268962990106266, + "grad_norm": 1.343700647354126, + "learning_rate": 1.670045986952522e-05, + "loss": 0.9302, + "step": 734 + }, + { + "epoch": 0.26932942469769144, + "grad_norm": 1.2460815906524658, + "learning_rate": 1.66918784743671e-05, + "loss": 0.8872, + "step": 735 + }, + { + "epoch": 0.2696958592891169, + "grad_norm": 1.4924426078796387, + "learning_rate": 1.6683288145550873e-05, + "loss": 0.8286, + "step": 736 + }, + { + "epoch": 0.2700622938805423, + "grad_norm": 1.3389033079147339, + "learning_rate": 1.667468889454464e-05, + "loss": 0.8757, + "step": 737 + }, + { + "epoch": 0.27042872847196775, + "grad_norm": 1.362754464149475, + "learning_rate": 1.6666080732828405e-05, + "loss": 0.9041, + "step": 738 + }, + { + "epoch": 0.2707951630633932, + "grad_norm": 1.2567392587661743, + "learning_rate": 1.6657463671894055e-05, + "loss": 0.9453, + "step": 739 + }, + { + "epoch": 0.2711615976548186, + "grad_norm": 1.3294352293014526, + "learning_rate": 1.6648837723245374e-05, + "loss": 0.8718, + "step": 740 + }, + { + "epoch": 0.27152803224624406, + "grad_norm": 1.354974627494812, + "learning_rate": 1.6640202898398007e-05, + "loss": 0.9177, + "step": 741 + }, + { + "epoch": 0.2718944668376695, + "grad_norm": 1.6166919469833374, + "learning_rate": 1.6631559208879432e-05, + "loss": 0.7697, + "step": 742 + }, + { + "epoch": 0.2722609014290949, + "grad_norm": 1.4290088415145874, + "learning_rate": 1.6622906666228983e-05, + "loss": 0.9476, + "step": 743 + }, + { + "epoch": 0.27262733602052036, + "grad_norm": 1.226045846939087, + "learning_rate": 1.661424528199781e-05, + "loss": 0.9194, + "step": 744 + }, + { + "epoch": 0.27299377061194574, + "grad_norm": 1.4980007410049438, + "learning_rate": 1.660557506774885e-05, + "loss": 0.9057, + "step": 745 + }, + { + "epoch": 0.2733602052033712, + "grad_norm": 1.4683295488357544, + "learning_rate": 1.6596896035056853e-05, + "loss": 0.8626, + "step": 746 + }, + { + "epoch": 0.2737266397947966, + "grad_norm": 1.223743200302124, + "learning_rate": 1.6588208195508316e-05, + "loss": 0.9407, + "step": 747 + }, + { + "epoch": 0.27409307438622205, + "grad_norm": 1.3194293975830078, + "learning_rate": 1.6579511560701513e-05, + "loss": 0.9266, + "step": 748 + }, + { + "epoch": 0.2744595089776475, + "grad_norm": 1.3832615613937378, + "learning_rate": 1.6570806142246448e-05, + "loss": 0.91, + "step": 749 + }, + { + "epoch": 0.2748259435690729, + "grad_norm": 1.4815707206726074, + "learning_rate": 1.656209195176485e-05, + "loss": 0.8604, + "step": 750 + }, + { + "epoch": 0.27519237816049835, + "grad_norm": 1.290116786956787, + "learning_rate": 1.655336900089018e-05, + "loss": 0.9047, + "step": 751 + }, + { + "epoch": 0.2755588127519238, + "grad_norm": 1.3829827308654785, + "learning_rate": 1.6544637301267558e-05, + "loss": 0.8946, + "step": 752 + }, + { + "epoch": 0.2759252473433492, + "grad_norm": 1.3662350177764893, + "learning_rate": 1.6535896864553818e-05, + "loss": 0.9196, + "step": 753 + }, + { + "epoch": 0.27629168193477466, + "grad_norm": 1.3391906023025513, + "learning_rate": 1.652714770241744e-05, + "loss": 0.9328, + "step": 754 + }, + { + "epoch": 0.2766581165262001, + "grad_norm": 1.3064299821853638, + "learning_rate": 1.651838982653855e-05, + "loss": 0.8811, + "step": 755 + }, + { + "epoch": 0.27702455111762553, + "grad_norm": 1.387911081314087, + "learning_rate": 1.650962324860892e-05, + "loss": 0.973, + "step": 756 + }, + { + "epoch": 0.2773909857090509, + "grad_norm": 1.292227029800415, + "learning_rate": 1.6500847980331923e-05, + "loss": 0.9254, + "step": 757 + }, + { + "epoch": 0.27775742030047634, + "grad_norm": 1.4158412218093872, + "learning_rate": 1.6492064033422554e-05, + "loss": 0.9298, + "step": 758 + }, + { + "epoch": 0.2781238548919018, + "grad_norm": 1.4395830631256104, + "learning_rate": 1.648327141960738e-05, + "loss": 0.9033, + "step": 759 + }, + { + "epoch": 0.2784902894833272, + "grad_norm": 1.3105932474136353, + "learning_rate": 1.6474470150624533e-05, + "loss": 0.8909, + "step": 760 + }, + { + "epoch": 0.27885672407475265, + "grad_norm": 1.6311067342758179, + "learning_rate": 1.646566023822371e-05, + "loss": 0.929, + "step": 761 + }, + { + "epoch": 0.2792231586661781, + "grad_norm": 1.4089274406433105, + "learning_rate": 1.6456841694166154e-05, + "loss": 0.8728, + "step": 762 + }, + { + "epoch": 0.2795895932576035, + "grad_norm": 1.506455659866333, + "learning_rate": 1.644801453022461e-05, + "loss": 0.8659, + "step": 763 + }, + { + "epoch": 0.27995602784902895, + "grad_norm": 1.3019893169403076, + "learning_rate": 1.643917875818335e-05, + "loss": 0.8047, + "step": 764 + }, + { + "epoch": 0.2803224624404544, + "grad_norm": 1.3945775032043457, + "learning_rate": 1.6430334389838126e-05, + "loss": 0.9586, + "step": 765 + }, + { + "epoch": 0.2806888970318798, + "grad_norm": 1.3473881483078003, + "learning_rate": 1.6421481436996173e-05, + "loss": 0.8552, + "step": 766 + }, + { + "epoch": 0.28105533162330526, + "grad_norm": 1.5067903995513916, + "learning_rate": 1.641261991147618e-05, + "loss": 0.9138, + "step": 767 + }, + { + "epoch": 0.2814217662147307, + "grad_norm": 1.4067862033843994, + "learning_rate": 1.6403749825108286e-05, + "loss": 0.8866, + "step": 768 + }, + { + "epoch": 0.2817882008061561, + "grad_norm": 1.6273012161254883, + "learning_rate": 1.6394871189734057e-05, + "loss": 0.8841, + "step": 769 + }, + { + "epoch": 0.2821546353975815, + "grad_norm": 1.539231538772583, + "learning_rate": 1.638598401720647e-05, + "loss": 0.8788, + "step": 770 + }, + { + "epoch": 0.28252106998900695, + "grad_norm": 1.7438411712646484, + "learning_rate": 1.6377088319389904e-05, + "loss": 0.8567, + "step": 771 + }, + { + "epoch": 0.2828875045804324, + "grad_norm": 1.4629979133605957, + "learning_rate": 1.6368184108160114e-05, + "loss": 0.8732, + "step": 772 + }, + { + "epoch": 0.2832539391718578, + "grad_norm": 1.3000826835632324, + "learning_rate": 1.635927139540422e-05, + "loss": 0.8814, + "step": 773 + }, + { + "epoch": 0.28362037376328325, + "grad_norm": 1.4041448831558228, + "learning_rate": 1.63503501930207e-05, + "loss": 0.9357, + "step": 774 + }, + { + "epoch": 0.2839868083547087, + "grad_norm": 1.3833987712860107, + "learning_rate": 1.6341420512919355e-05, + "loss": 0.8805, + "step": 775 + }, + { + "epoch": 0.2843532429461341, + "grad_norm": 1.5424768924713135, + "learning_rate": 1.6332482367021308e-05, + "loss": 0.8681, + "step": 776 + }, + { + "epoch": 0.28471967753755956, + "grad_norm": 1.3150067329406738, + "learning_rate": 1.6323535767258987e-05, + "loss": 0.904, + "step": 777 + }, + { + "epoch": 0.285086112128985, + "grad_norm": 1.4194296598434448, + "learning_rate": 1.63145807255761e-05, + "loss": 0.9059, + "step": 778 + }, + { + "epoch": 0.2854525467204104, + "grad_norm": 1.305395245552063, + "learning_rate": 1.6305617253927635e-05, + "loss": 0.9258, + "step": 779 + }, + { + "epoch": 0.28581898131183586, + "grad_norm": 1.4417041540145874, + "learning_rate": 1.6296645364279816e-05, + "loss": 0.886, + "step": 780 + }, + { + "epoch": 0.28618541590326124, + "grad_norm": 1.3115651607513428, + "learning_rate": 1.6287665068610127e-05, + "loss": 0.9332, + "step": 781 + }, + { + "epoch": 0.2865518504946867, + "grad_norm": 1.4982367753982544, + "learning_rate": 1.6278676378907258e-05, + "loss": 0.8913, + "step": 782 + }, + { + "epoch": 0.2869182850861121, + "grad_norm": 1.3922163248062134, + "learning_rate": 1.6269679307171108e-05, + "loss": 0.9012, + "step": 783 + }, + { + "epoch": 0.28728471967753755, + "grad_norm": 1.4132483005523682, + "learning_rate": 1.626067386541277e-05, + "loss": 0.8996, + "step": 784 + }, + { + "epoch": 0.287651154268963, + "grad_norm": 1.4408533573150635, + "learning_rate": 1.6251660065654513e-05, + "loss": 0.9164, + "step": 785 + }, + { + "epoch": 0.2880175888603884, + "grad_norm": 1.3485194444656372, + "learning_rate": 1.6242637919929752e-05, + "loss": 0.9189, + "step": 786 + }, + { + "epoch": 0.28838402345181385, + "grad_norm": 1.4074796438217163, + "learning_rate": 1.6233607440283066e-05, + "loss": 0.8408, + "step": 787 + }, + { + "epoch": 0.2887504580432393, + "grad_norm": 1.3223270177841187, + "learning_rate": 1.6224568638770127e-05, + "loss": 0.9216, + "step": 788 + }, + { + "epoch": 0.2891168926346647, + "grad_norm": 1.3968936204910278, + "learning_rate": 1.6215521527457752e-05, + "loss": 0.8787, + "step": 789 + }, + { + "epoch": 0.28948332722609016, + "grad_norm": 1.5885320901870728, + "learning_rate": 1.6206466118423824e-05, + "loss": 0.8786, + "step": 790 + }, + { + "epoch": 0.2898497618175156, + "grad_norm": 1.3874950408935547, + "learning_rate": 1.6197402423757323e-05, + "loss": 0.902, + "step": 791 + }, + { + "epoch": 0.29021619640894103, + "grad_norm": 1.279403805732727, + "learning_rate": 1.6188330455558277e-05, + "loss": 0.9003, + "step": 792 + }, + { + "epoch": 0.2905826310003664, + "grad_norm": 1.3086025714874268, + "learning_rate": 1.6179250225937764e-05, + "loss": 0.8931, + "step": 793 + }, + { + "epoch": 0.29094906559179184, + "grad_norm": 1.386427402496338, + "learning_rate": 1.617016174701789e-05, + "loss": 0.9941, + "step": 794 + }, + { + "epoch": 0.2913155001832173, + "grad_norm": 1.4497308731079102, + "learning_rate": 1.6161065030931777e-05, + "loss": 0.8774, + "step": 795 + }, + { + "epoch": 0.2916819347746427, + "grad_norm": 1.4114580154418945, + "learning_rate": 1.6151960089823542e-05, + "loss": 0.8589, + "step": 796 + }, + { + "epoch": 0.29204836936606815, + "grad_norm": 1.627975344657898, + "learning_rate": 1.614284693584828e-05, + "loss": 0.9294, + "step": 797 + }, + { + "epoch": 0.2924148039574936, + "grad_norm": 1.55379056930542, + "learning_rate": 1.6133725581172056e-05, + "loss": 0.7997, + "step": 798 + }, + { + "epoch": 0.292781238548919, + "grad_norm": 1.4049032926559448, + "learning_rate": 1.6124596037971867e-05, + "loss": 0.8726, + "step": 799 + }, + { + "epoch": 0.29314767314034446, + "grad_norm": 1.6755776405334473, + "learning_rate": 1.611545831843567e-05, + "loss": 0.8759, + "step": 800 + }, + { + "epoch": 0.2935141077317699, + "grad_norm": 1.4013546705245972, + "learning_rate": 1.6106312434762315e-05, + "loss": 0.9327, + "step": 801 + }, + { + "epoch": 0.2938805423231953, + "grad_norm": 1.3094841241836548, + "learning_rate": 1.609715839916155e-05, + "loss": 0.9164, + "step": 802 + }, + { + "epoch": 0.29424697691462076, + "grad_norm": 1.4455472230911255, + "learning_rate": 1.608799622385402e-05, + "loss": 0.8573, + "step": 803 + }, + { + "epoch": 0.2946134115060462, + "grad_norm": 1.357675552368164, + "learning_rate": 1.607882592107123e-05, + "loss": 0.8871, + "step": 804 + }, + { + "epoch": 0.2949798460974716, + "grad_norm": 1.3851970434188843, + "learning_rate": 1.6069647503055532e-05, + "loss": 0.8517, + "step": 805 + }, + { + "epoch": 0.295346280688897, + "grad_norm": 1.5386179685592651, + "learning_rate": 1.6060460982060116e-05, + "loss": 0.854, + "step": 806 + }, + { + "epoch": 0.29571271528032245, + "grad_norm": 1.4447675943374634, + "learning_rate": 1.6051266370348985e-05, + "loss": 0.845, + "step": 807 + }, + { + "epoch": 0.2960791498717479, + "grad_norm": 1.5048695802688599, + "learning_rate": 1.604206368019695e-05, + "loss": 0.852, + "step": 808 + }, + { + "epoch": 0.2964455844631733, + "grad_norm": 1.702054500579834, + "learning_rate": 1.60328529238896e-05, + "loss": 0.8974, + "step": 809 + }, + { + "epoch": 0.29681201905459875, + "grad_norm": 1.6284360885620117, + "learning_rate": 1.6023634113723297e-05, + "loss": 0.9014, + "step": 810 + }, + { + "epoch": 0.2971784536460242, + "grad_norm": 1.4068750143051147, + "learning_rate": 1.6014407262005152e-05, + "loss": 0.8157, + "step": 811 + }, + { + "epoch": 0.2975448882374496, + "grad_norm": 1.3355695009231567, + "learning_rate": 1.6005172381053012e-05, + "loss": 0.8513, + "step": 812 + }, + { + "epoch": 0.29791132282887506, + "grad_norm": 1.4846398830413818, + "learning_rate": 1.5995929483195443e-05, + "loss": 0.8568, + "step": 813 + }, + { + "epoch": 0.2982777574203005, + "grad_norm": 1.259242296218872, + "learning_rate": 1.598667858077172e-05, + "loss": 0.9381, + "step": 814 + }, + { + "epoch": 0.29864419201172593, + "grad_norm": 1.4263806343078613, + "learning_rate": 1.597741968613179e-05, + "loss": 0.8763, + "step": 815 + }, + { + "epoch": 0.29901062660315136, + "grad_norm": 1.3765134811401367, + "learning_rate": 1.5968152811636283e-05, + "loss": 0.8603, + "step": 816 + }, + { + "epoch": 0.29937706119457674, + "grad_norm": 1.2649085521697998, + "learning_rate": 1.595887796965647e-05, + "loss": 0.9276, + "step": 817 + }, + { + "epoch": 0.2997434957860022, + "grad_norm": 1.4955521821975708, + "learning_rate": 1.594959517257428e-05, + "loss": 0.844, + "step": 818 + }, + { + "epoch": 0.3001099303774276, + "grad_norm": 1.2310223579406738, + "learning_rate": 1.5940304432782233e-05, + "loss": 0.8783, + "step": 819 + }, + { + "epoch": 0.30047636496885305, + "grad_norm": 1.319631576538086, + "learning_rate": 1.5931005762683473e-05, + "loss": 0.9017, + "step": 820 + }, + { + "epoch": 0.3008427995602785, + "grad_norm": 1.3764492273330688, + "learning_rate": 1.5921699174691724e-05, + "loss": 0.832, + "step": 821 + }, + { + "epoch": 0.3012092341517039, + "grad_norm": 1.4033286571502686, + "learning_rate": 1.5912384681231282e-05, + "loss": 0.8962, + "step": 822 + }, + { + "epoch": 0.30157566874312935, + "grad_norm": 1.5620622634887695, + "learning_rate": 1.5903062294737e-05, + "loss": 0.9079, + "step": 823 + }, + { + "epoch": 0.3019421033345548, + "grad_norm": 1.4058263301849365, + "learning_rate": 1.5893732027654258e-05, + "loss": 0.9625, + "step": 824 + }, + { + "epoch": 0.3023085379259802, + "grad_norm": 1.4301252365112305, + "learning_rate": 1.5884393892438962e-05, + "loss": 0.8528, + "step": 825 + }, + { + "epoch": 0.30267497251740566, + "grad_norm": 1.5510419607162476, + "learning_rate": 1.587504790155753e-05, + "loss": 0.858, + "step": 826 + }, + { + "epoch": 0.3030414071088311, + "grad_norm": 1.5547479391098022, + "learning_rate": 1.586569406748686e-05, + "loss": 0.9054, + "step": 827 + }, + { + "epoch": 0.30340784170025653, + "grad_norm": 1.5077707767486572, + "learning_rate": 1.585633240271431e-05, + "loss": 0.8879, + "step": 828 + }, + { + "epoch": 0.3037742762916819, + "grad_norm": 1.394148588180542, + "learning_rate": 1.5846962919737717e-05, + "loss": 0.8883, + "step": 829 + }, + { + "epoch": 0.30414071088310735, + "grad_norm": 1.3555182218551636, + "learning_rate": 1.5837585631065334e-05, + "loss": 0.904, + "step": 830 + }, + { + "epoch": 0.3045071454745328, + "grad_norm": 1.3965303897857666, + "learning_rate": 1.5828200549215843e-05, + "loss": 0.8471, + "step": 831 + }, + { + "epoch": 0.3048735800659582, + "grad_norm": 1.4720205068588257, + "learning_rate": 1.581880768671833e-05, + "loss": 0.8241, + "step": 832 + }, + { + "epoch": 0.30524001465738365, + "grad_norm": 1.3568192720413208, + "learning_rate": 1.580940705611226e-05, + "loss": 0.8405, + "step": 833 + }, + { + "epoch": 0.3056064492488091, + "grad_norm": 1.3553125858306885, + "learning_rate": 1.5799998669947485e-05, + "loss": 0.8948, + "step": 834 + }, + { + "epoch": 0.3059728838402345, + "grad_norm": 1.5014175176620483, + "learning_rate": 1.5790582540784196e-05, + "loss": 0.8631, + "step": 835 + }, + { + "epoch": 0.30633931843165996, + "grad_norm": 1.5766485929489136, + "learning_rate": 1.5781158681192928e-05, + "loss": 0.8352, + "step": 836 + }, + { + "epoch": 0.3067057530230854, + "grad_norm": 1.4493907690048218, + "learning_rate": 1.577172710375453e-05, + "loss": 0.8646, + "step": 837 + }, + { + "epoch": 0.3070721876145108, + "grad_norm": 1.5850780010223389, + "learning_rate": 1.5762287821060164e-05, + "loss": 0.8462, + "step": 838 + }, + { + "epoch": 0.30743862220593626, + "grad_norm": 1.5644487142562866, + "learning_rate": 1.575284084571127e-05, + "loss": 0.9149, + "step": 839 + }, + { + "epoch": 0.3078050567973617, + "grad_norm": 1.4901156425476074, + "learning_rate": 1.5743386190319556e-05, + "loss": 0.8686, + "step": 840 + }, + { + "epoch": 0.3081714913887871, + "grad_norm": 1.4039537906646729, + "learning_rate": 1.5733923867506995e-05, + "loss": 0.8827, + "step": 841 + }, + { + "epoch": 0.3085379259802125, + "grad_norm": 1.3826498985290527, + "learning_rate": 1.5724453889905787e-05, + "loss": 0.9294, + "step": 842 + }, + { + "epoch": 0.30890436057163795, + "grad_norm": 1.324507474899292, + "learning_rate": 1.571497627015835e-05, + "loss": 0.9455, + "step": 843 + }, + { + "epoch": 0.3092707951630634, + "grad_norm": 1.5789650678634644, + "learning_rate": 1.570549102091731e-05, + "loss": 0.8778, + "step": 844 + }, + { + "epoch": 0.3096372297544888, + "grad_norm": 1.2591837644577026, + "learning_rate": 1.5695998154845468e-05, + "loss": 0.9207, + "step": 845 + }, + { + "epoch": 0.31000366434591425, + "grad_norm": 1.3110920190811157, + "learning_rate": 1.5686497684615804e-05, + "loss": 0.9022, + "step": 846 + }, + { + "epoch": 0.3103700989373397, + "grad_norm": 1.377200722694397, + "learning_rate": 1.5676989622911457e-05, + "loss": 0.8624, + "step": 847 + }, + { + "epoch": 0.3107365335287651, + "grad_norm": 1.3229527473449707, + "learning_rate": 1.5667473982425673e-05, + "loss": 0.8857, + "step": 848 + }, + { + "epoch": 0.31110296812019056, + "grad_norm": 1.7923849821090698, + "learning_rate": 1.5657950775861842e-05, + "loss": 0.8445, + "step": 849 + }, + { + "epoch": 0.311469402711616, + "grad_norm": 1.4395514726638794, + "learning_rate": 1.5648420015933444e-05, + "loss": 0.9183, + "step": 850 + }, + { + "epoch": 0.31183583730304143, + "grad_norm": 1.5209360122680664, + "learning_rate": 1.5638881715364042e-05, + "loss": 0.879, + "step": 851 + }, + { + "epoch": 0.31220227189446687, + "grad_norm": 1.3047930002212524, + "learning_rate": 1.5629335886887272e-05, + "loss": 0.9366, + "step": 852 + }, + { + "epoch": 0.31256870648589224, + "grad_norm": 1.5573649406433105, + "learning_rate": 1.5619782543246813e-05, + "loss": 0.8785, + "step": 853 + }, + { + "epoch": 0.3129351410773177, + "grad_norm": 1.3706642389297485, + "learning_rate": 1.561022169719638e-05, + "loss": 0.9162, + "step": 854 + }, + { + "epoch": 0.3133015756687431, + "grad_norm": 1.303852915763855, + "learning_rate": 1.56006533614997e-05, + "loss": 0.8853, + "step": 855 + }, + { + "epoch": 0.31366801026016855, + "grad_norm": 1.4067341089248657, + "learning_rate": 1.55910775489305e-05, + "loss": 0.9383, + "step": 856 + }, + { + "epoch": 0.314034444851594, + "grad_norm": 1.5358949899673462, + "learning_rate": 1.55814942722725e-05, + "loss": 0.8933, + "step": 857 + }, + { + "epoch": 0.3144008794430194, + "grad_norm": 1.4367671012878418, + "learning_rate": 1.5571903544319365e-05, + "loss": 0.8596, + "step": 858 + }, + { + "epoch": 0.31476731403444486, + "grad_norm": 1.280152440071106, + "learning_rate": 1.556230537787472e-05, + "loss": 0.8472, + "step": 859 + }, + { + "epoch": 0.3151337486258703, + "grad_norm": 1.437868595123291, + "learning_rate": 1.555269978575212e-05, + "loss": 0.8706, + "step": 860 + }, + { + "epoch": 0.3155001832172957, + "grad_norm": 1.5736864805221558, + "learning_rate": 1.554308678077503e-05, + "loss": 0.8014, + "step": 861 + }, + { + "epoch": 0.31586661780872116, + "grad_norm": 1.3937286138534546, + "learning_rate": 1.5533466375776812e-05, + "loss": 0.9138, + "step": 862 + }, + { + "epoch": 0.3162330524001466, + "grad_norm": 1.6589691638946533, + "learning_rate": 1.5523838583600706e-05, + "loss": 0.8216, + "step": 863 + }, + { + "epoch": 0.316599486991572, + "grad_norm": 1.3993836641311646, + "learning_rate": 1.5514203417099816e-05, + "loss": 0.8878, + "step": 864 + }, + { + "epoch": 0.3169659215829974, + "grad_norm": 1.3711109161376953, + "learning_rate": 1.550456088913709e-05, + "loss": 0.919, + "step": 865 + }, + { + "epoch": 0.31733235617442285, + "grad_norm": 1.5643397569656372, + "learning_rate": 1.5494911012585303e-05, + "loss": 0.9188, + "step": 866 + }, + { + "epoch": 0.3176987907658483, + "grad_norm": 1.5048398971557617, + "learning_rate": 1.5485253800327043e-05, + "loss": 0.9647, + "step": 867 + }, + { + "epoch": 0.3180652253572737, + "grad_norm": 1.6283950805664062, + "learning_rate": 1.547558926525468e-05, + "loss": 0.813, + "step": 868 + }, + { + "epoch": 0.31843165994869915, + "grad_norm": 1.706950068473816, + "learning_rate": 1.546591742027038e-05, + "loss": 0.7791, + "step": 869 + }, + { + "epoch": 0.3187980945401246, + "grad_norm": 1.3241078853607178, + "learning_rate": 1.545623827828605e-05, + "loss": 0.922, + "step": 870 + }, + { + "epoch": 0.31916452913155, + "grad_norm": 1.6119778156280518, + "learning_rate": 1.5446551852223346e-05, + "loss": 0.8562, + "step": 871 + }, + { + "epoch": 0.31953096372297546, + "grad_norm": 1.4082729816436768, + "learning_rate": 1.5436858155013653e-05, + "loss": 0.9317, + "step": 872 + }, + { + "epoch": 0.3198973983144009, + "grad_norm": 1.4613032341003418, + "learning_rate": 1.5427157199598048e-05, + "loss": 0.8964, + "step": 873 + }, + { + "epoch": 0.32026383290582633, + "grad_norm": 1.325289249420166, + "learning_rate": 1.541744899892731e-05, + "loss": 0.9091, + "step": 874 + }, + { + "epoch": 0.32063026749725176, + "grad_norm": 1.4540269374847412, + "learning_rate": 1.5407733565961894e-05, + "loss": 0.8688, + "step": 875 + }, + { + "epoch": 0.32099670208867714, + "grad_norm": 1.56600821018219, + "learning_rate": 1.5398010913671894e-05, + "loss": 0.9103, + "step": 876 + }, + { + "epoch": 0.3213631366801026, + "grad_norm": 1.4235271215438843, + "learning_rate": 1.5388281055037054e-05, + "loss": 0.9022, + "step": 877 + }, + { + "epoch": 0.321729571271528, + "grad_norm": 1.42256498336792, + "learning_rate": 1.5378544003046743e-05, + "loss": 0.8512, + "step": 878 + }, + { + "epoch": 0.32209600586295345, + "grad_norm": 1.3547056913375854, + "learning_rate": 1.5368799770699915e-05, + "loss": 0.9118, + "step": 879 + }, + { + "epoch": 0.3224624404543789, + "grad_norm": 1.5990328788757324, + "learning_rate": 1.5359048371005128e-05, + "loss": 0.8412, + "step": 880 + }, + { + "epoch": 0.3228288750458043, + "grad_norm": 1.3500826358795166, + "learning_rate": 1.5349289816980498e-05, + "loss": 0.8604, + "step": 881 + }, + { + "epoch": 0.32319530963722976, + "grad_norm": 1.513796091079712, + "learning_rate": 1.5339524121653698e-05, + "loss": 0.8849, + "step": 882 + }, + { + "epoch": 0.3235617442286552, + "grad_norm": 1.3060262203216553, + "learning_rate": 1.532975129806193e-05, + "loss": 0.9494, + "step": 883 + }, + { + "epoch": 0.3239281788200806, + "grad_norm": 1.295326828956604, + "learning_rate": 1.531997135925191e-05, + "loss": 0.8887, + "step": 884 + }, + { + "epoch": 0.32429461341150606, + "grad_norm": 1.467307686805725, + "learning_rate": 1.5310184318279864e-05, + "loss": 0.8879, + "step": 885 + }, + { + "epoch": 0.3246610480029315, + "grad_norm": 1.5006343126296997, + "learning_rate": 1.530039018821149e-05, + "loss": 0.8955, + "step": 886 + }, + { + "epoch": 0.32502748259435693, + "grad_norm": 1.5569864511489868, + "learning_rate": 1.5290588982121945e-05, + "loss": 0.8606, + "step": 887 + }, + { + "epoch": 0.3253939171857823, + "grad_norm": 1.6372853517532349, + "learning_rate": 1.5280780713095852e-05, + "loss": 0.9186, + "step": 888 + }, + { + "epoch": 0.32576035177720775, + "grad_norm": 1.3921681642532349, + "learning_rate": 1.5270965394227237e-05, + "loss": 0.8669, + "step": 889 + }, + { + "epoch": 0.3261267863686332, + "grad_norm": 1.4064171314239502, + "learning_rate": 1.5261143038619562e-05, + "loss": 0.8973, + "step": 890 + }, + { + "epoch": 0.3264932209600586, + "grad_norm": 1.840980052947998, + "learning_rate": 1.525131365938567e-05, + "loss": 0.8078, + "step": 891 + }, + { + "epoch": 0.32685965555148405, + "grad_norm": 1.7794231176376343, + "learning_rate": 1.524147726964778e-05, + "loss": 0.7925, + "step": 892 + }, + { + "epoch": 0.3272260901429095, + "grad_norm": 1.3070979118347168, + "learning_rate": 1.5231633882537474e-05, + "loss": 0.8818, + "step": 893 + }, + { + "epoch": 0.3275925247343349, + "grad_norm": 1.4535861015319824, + "learning_rate": 1.5221783511195674e-05, + "loss": 0.8376, + "step": 894 + }, + { + "epoch": 0.32795895932576036, + "grad_norm": 1.4390013217926025, + "learning_rate": 1.5211926168772626e-05, + "loss": 0.8658, + "step": 895 + }, + { + "epoch": 0.3283253939171858, + "grad_norm": 1.4747962951660156, + "learning_rate": 1.5202061868427884e-05, + "loss": 0.9033, + "step": 896 + }, + { + "epoch": 0.32869182850861123, + "grad_norm": 1.7829028367996216, + "learning_rate": 1.519219062333029e-05, + "loss": 0.7904, + "step": 897 + }, + { + "epoch": 0.32905826310003666, + "grad_norm": 1.575325608253479, + "learning_rate": 1.5182312446657954e-05, + "loss": 0.8527, + "step": 898 + }, + { + "epoch": 0.3294246976914621, + "grad_norm": 1.4097166061401367, + "learning_rate": 1.5172427351598243e-05, + "loss": 0.8692, + "step": 899 + }, + { + "epoch": 0.3297911322828875, + "grad_norm": 1.6933417320251465, + "learning_rate": 1.516253535134776e-05, + "loss": 0.8665, + "step": 900 + }, + { + "epoch": 0.3301575668743129, + "grad_norm": 1.4317610263824463, + "learning_rate": 1.5152636459112327e-05, + "loss": 0.9007, + "step": 901 + }, + { + "epoch": 0.33052400146573835, + "grad_norm": 1.781380295753479, + "learning_rate": 1.5142730688106966e-05, + "loss": 0.835, + "step": 902 + }, + { + "epoch": 0.3308904360571638, + "grad_norm": 1.495296835899353, + "learning_rate": 1.5132818051555878e-05, + "loss": 0.9437, + "step": 903 + }, + { + "epoch": 0.3312568706485892, + "grad_norm": 1.6618905067443848, + "learning_rate": 1.5122898562692437e-05, + "loss": 0.8886, + "step": 904 + }, + { + "epoch": 0.33162330524001465, + "grad_norm": 1.5304425954818726, + "learning_rate": 1.5112972234759155e-05, + "loss": 0.889, + "step": 905 + }, + { + "epoch": 0.3319897398314401, + "grad_norm": 1.4052519798278809, + "learning_rate": 1.5103039081007691e-05, + "loss": 0.8726, + "step": 906 + }, + { + "epoch": 0.3323561744228655, + "grad_norm": 1.5464938879013062, + "learning_rate": 1.5093099114698797e-05, + "loss": 0.878, + "step": 907 + }, + { + "epoch": 0.33272260901429096, + "grad_norm": 1.5896848440170288, + "learning_rate": 1.5083152349102332e-05, + "loss": 0.9362, + "step": 908 + }, + { + "epoch": 0.3330890436057164, + "grad_norm": 1.5317317247390747, + "learning_rate": 1.5073198797497228e-05, + "loss": 0.8346, + "step": 909 + }, + { + "epoch": 0.33345547819714183, + "grad_norm": 1.634902834892273, + "learning_rate": 1.5063238473171475e-05, + "loss": 0.9317, + "step": 910 + }, + { + "epoch": 0.33382191278856727, + "grad_norm": 1.6703317165374756, + "learning_rate": 1.5053271389422111e-05, + "loss": 0.8432, + "step": 911 + }, + { + "epoch": 0.33418834737999265, + "grad_norm": 1.6900526285171509, + "learning_rate": 1.5043297559555188e-05, + "loss": 0.7934, + "step": 912 + }, + { + "epoch": 0.3345547819714181, + "grad_norm": 1.5616822242736816, + "learning_rate": 1.5033316996885772e-05, + "loss": 0.8728, + "step": 913 + }, + { + "epoch": 0.3349212165628435, + "grad_norm": 1.455959677696228, + "learning_rate": 1.5023329714737917e-05, + "loss": 0.8429, + "step": 914 + }, + { + "epoch": 0.33528765115426895, + "grad_norm": 1.4425702095031738, + "learning_rate": 1.5013335726444642e-05, + "loss": 0.9135, + "step": 915 + }, + { + "epoch": 0.3356540857456944, + "grad_norm": 1.6457642316818237, + "learning_rate": 1.5003335045347921e-05, + "loss": 0.8378, + "step": 916 + }, + { + "epoch": 0.3360205203371198, + "grad_norm": 1.4372854232788086, + "learning_rate": 1.4993327684798667e-05, + "loss": 0.9147, + "step": 917 + }, + { + "epoch": 0.33638695492854526, + "grad_norm": 1.5803145170211792, + "learning_rate": 1.4983313658156704e-05, + "loss": 0.8997, + "step": 918 + }, + { + "epoch": 0.3367533895199707, + "grad_norm": 1.4921263456344604, + "learning_rate": 1.497329297879076e-05, + "loss": 0.9059, + "step": 919 + }, + { + "epoch": 0.3371198241113961, + "grad_norm": 1.6669347286224365, + "learning_rate": 1.4963265660078441e-05, + "loss": 0.87, + "step": 920 + }, + { + "epoch": 0.33748625870282156, + "grad_norm": 1.3645561933517456, + "learning_rate": 1.4953231715406219e-05, + "loss": 0.896, + "step": 921 + }, + { + "epoch": 0.337852693294247, + "grad_norm": 1.6434755325317383, + "learning_rate": 1.494319115816941e-05, + "loss": 0.7876, + "step": 922 + }, + { + "epoch": 0.33821912788567243, + "grad_norm": 1.6944279670715332, + "learning_rate": 1.493314400177216e-05, + "loss": 0.8852, + "step": 923 + }, + { + "epoch": 0.3385855624770978, + "grad_norm": 1.7454200983047485, + "learning_rate": 1.4923090259627422e-05, + "loss": 0.7508, + "step": 924 + }, + { + "epoch": 0.33895199706852325, + "grad_norm": 1.5646865367889404, + "learning_rate": 1.491302994515694e-05, + "loss": 0.9465, + "step": 925 + }, + { + "epoch": 0.3393184316599487, + "grad_norm": 1.416644811630249, + "learning_rate": 1.490296307179124e-05, + "loss": 0.8758, + "step": 926 + }, + { + "epoch": 0.3396848662513741, + "grad_norm": 1.4960588216781616, + "learning_rate": 1.489288965296959e-05, + "loss": 0.9026, + "step": 927 + }, + { + "epoch": 0.34005130084279955, + "grad_norm": 1.7496960163116455, + "learning_rate": 1.4882809702140014e-05, + "loss": 0.8591, + "step": 928 + }, + { + "epoch": 0.340417735434225, + "grad_norm": 1.5206427574157715, + "learning_rate": 1.4872723232759243e-05, + "loss": 0.8428, + "step": 929 + }, + { + "epoch": 0.3407841700256504, + "grad_norm": 1.3755855560302734, + "learning_rate": 1.486263025829271e-05, + "loss": 0.9219, + "step": 930 + }, + { + "epoch": 0.34115060461707586, + "grad_norm": 1.4779146909713745, + "learning_rate": 1.4852530792214541e-05, + "loss": 0.8751, + "step": 931 + }, + { + "epoch": 0.3415170392085013, + "grad_norm": 1.670133352279663, + "learning_rate": 1.4842424848007526e-05, + "loss": 0.8551, + "step": 932 + }, + { + "epoch": 0.34188347379992673, + "grad_norm": 1.8766719102859497, + "learning_rate": 1.4832312439163095e-05, + "loss": 0.7814, + "step": 933 + }, + { + "epoch": 0.34224990839135216, + "grad_norm": 1.638991117477417, + "learning_rate": 1.4822193579181318e-05, + "loss": 0.8566, + "step": 934 + }, + { + "epoch": 0.3426163429827776, + "grad_norm": 1.517900824546814, + "learning_rate": 1.4812068281570871e-05, + "loss": 0.9234, + "step": 935 + }, + { + "epoch": 0.342982777574203, + "grad_norm": 1.4461380243301392, + "learning_rate": 1.4801936559849028e-05, + "loss": 0.9009, + "step": 936 + }, + { + "epoch": 0.3433492121656284, + "grad_norm": 1.518513560295105, + "learning_rate": 1.4791798427541644e-05, + "loss": 0.9789, + "step": 937 + }, + { + "epoch": 0.34371564675705385, + "grad_norm": 1.5770090818405151, + "learning_rate": 1.4781653898183116e-05, + "loss": 0.9328, + "step": 938 + }, + { + "epoch": 0.3440820813484793, + "grad_norm": 1.4077907800674438, + "learning_rate": 1.4771502985316395e-05, + "loss": 0.8984, + "step": 939 + }, + { + "epoch": 0.3444485159399047, + "grad_norm": 1.5162850618362427, + "learning_rate": 1.4761345702492955e-05, + "loss": 0.8382, + "step": 940 + }, + { + "epoch": 0.34481495053133016, + "grad_norm": 1.5433305501937866, + "learning_rate": 1.4751182063272763e-05, + "loss": 0.872, + "step": 941 + }, + { + "epoch": 0.3451813851227556, + "grad_norm": 1.393364429473877, + "learning_rate": 1.474101208122428e-05, + "loss": 0.9207, + "step": 942 + }, + { + "epoch": 0.345547819714181, + "grad_norm": 1.435567855834961, + "learning_rate": 1.4730835769924432e-05, + "loss": 0.8501, + "step": 943 + }, + { + "epoch": 0.34591425430560646, + "grad_norm": 1.5786954164505005, + "learning_rate": 1.4720653142958595e-05, + "loss": 0.8781, + "step": 944 + }, + { + "epoch": 0.3462806888970319, + "grad_norm": 1.5463298559188843, + "learning_rate": 1.4710464213920579e-05, + "loss": 0.858, + "step": 945 + }, + { + "epoch": 0.34664712348845733, + "grad_norm": 1.3941972255706787, + "learning_rate": 1.4700268996412601e-05, + "loss": 0.9108, + "step": 946 + }, + { + "epoch": 0.34701355807988277, + "grad_norm": 1.7360153198242188, + "learning_rate": 1.4690067504045282e-05, + "loss": 0.8832, + "step": 947 + }, + { + "epoch": 0.34737999267130815, + "grad_norm": 1.4550964832305908, + "learning_rate": 1.467985975043761e-05, + "loss": 0.9578, + "step": 948 + }, + { + "epoch": 0.3477464272627336, + "grad_norm": 1.9805035591125488, + "learning_rate": 1.4669645749216936e-05, + "loss": 0.8997, + "step": 949 + }, + { + "epoch": 0.348112861854159, + "grad_norm": 1.642874836921692, + "learning_rate": 1.4659425514018956e-05, + "loss": 0.8798, + "step": 950 + }, + { + "epoch": 0.34847929644558445, + "grad_norm": 1.7270559072494507, + "learning_rate": 1.4649199058487682e-05, + "loss": 0.8678, + "step": 951 + }, + { + "epoch": 0.3488457310370099, + "grad_norm": 1.448574185371399, + "learning_rate": 1.4638966396275434e-05, + "loss": 0.8721, + "step": 952 + }, + { + "epoch": 0.3492121656284353, + "grad_norm": 1.5349843502044678, + "learning_rate": 1.4628727541042815e-05, + "loss": 0.9147, + "step": 953 + }, + { + "epoch": 0.34957860021986076, + "grad_norm": 1.7318191528320312, + "learning_rate": 1.4618482506458702e-05, + "loss": 0.9098, + "step": 954 + }, + { + "epoch": 0.3499450348112862, + "grad_norm": 1.4772664308547974, + "learning_rate": 1.4608231306200211e-05, + "loss": 0.915, + "step": 955 + }, + { + "epoch": 0.35031146940271163, + "grad_norm": 1.7366101741790771, + "learning_rate": 1.4597973953952696e-05, + "loss": 0.855, + "step": 956 + }, + { + "epoch": 0.35067790399413706, + "grad_norm": 1.4249106645584106, + "learning_rate": 1.4587710463409721e-05, + "loss": 0.9456, + "step": 957 + }, + { + "epoch": 0.3510443385855625, + "grad_norm": 1.5543416738510132, + "learning_rate": 1.4577440848273052e-05, + "loss": 0.9116, + "step": 958 + }, + { + "epoch": 0.35141077317698793, + "grad_norm": 1.4858545064926147, + "learning_rate": 1.456716512225262e-05, + "loss": 0.8623, + "step": 959 + }, + { + "epoch": 0.3517772077684133, + "grad_norm": 1.6535437107086182, + "learning_rate": 1.4556883299066529e-05, + "loss": 0.8603, + "step": 960 + }, + { + "epoch": 0.35214364235983875, + "grad_norm": 1.3971456289291382, + "learning_rate": 1.4546595392441e-05, + "loss": 0.8705, + "step": 961 + }, + { + "epoch": 0.3525100769512642, + "grad_norm": 1.7072619199752808, + "learning_rate": 1.4536301416110398e-05, + "loss": 0.7356, + "step": 962 + }, + { + "epoch": 0.3528765115426896, + "grad_norm": 1.5533279180526733, + "learning_rate": 1.4526001383817182e-05, + "loss": 0.8684, + "step": 963 + }, + { + "epoch": 0.35324294613411505, + "grad_norm": 1.5234684944152832, + "learning_rate": 1.4515695309311887e-05, + "loss": 0.8985, + "step": 964 + }, + { + "epoch": 0.3536093807255405, + "grad_norm": 1.4784553050994873, + "learning_rate": 1.4505383206353133e-05, + "loss": 0.8635, + "step": 965 + }, + { + "epoch": 0.3539758153169659, + "grad_norm": 1.4752111434936523, + "learning_rate": 1.4495065088707576e-05, + "loss": 0.9312, + "step": 966 + }, + { + "epoch": 0.35434224990839136, + "grad_norm": 1.7725974321365356, + "learning_rate": 1.44847409701499e-05, + "loss": 0.9453, + "step": 967 + }, + { + "epoch": 0.3547086844998168, + "grad_norm": 1.5872156620025635, + "learning_rate": 1.4474410864462807e-05, + "loss": 0.8472, + "step": 968 + }, + { + "epoch": 0.35507511909124223, + "grad_norm": 1.4837459325790405, + "learning_rate": 1.4464074785436986e-05, + "loss": 0.8928, + "step": 969 + }, + { + "epoch": 0.35544155368266767, + "grad_norm": 1.5927033424377441, + "learning_rate": 1.44537327468711e-05, + "loss": 0.8754, + "step": 970 + }, + { + "epoch": 0.3558079882740931, + "grad_norm": 1.8709561824798584, + "learning_rate": 1.4443384762571781e-05, + "loss": 0.8707, + "step": 971 + }, + { + "epoch": 0.3561744228655185, + "grad_norm": 1.2936451435089111, + "learning_rate": 1.4433030846353582e-05, + "loss": 0.8956, + "step": 972 + }, + { + "epoch": 0.3565408574569439, + "grad_norm": 1.4189300537109375, + "learning_rate": 1.4422671012038982e-05, + "loss": 0.9132, + "step": 973 + }, + { + "epoch": 0.35690729204836935, + "grad_norm": 1.8624743223190308, + "learning_rate": 1.4412305273458361e-05, + "loss": 0.7831, + "step": 974 + }, + { + "epoch": 0.3572737266397948, + "grad_norm": 1.4298512935638428, + "learning_rate": 1.4401933644449977e-05, + "loss": 0.8054, + "step": 975 + }, + { + "epoch": 0.3576401612312202, + "grad_norm": 1.5367136001586914, + "learning_rate": 1.4391556138859962e-05, + "loss": 0.8131, + "step": 976 + }, + { + "epoch": 0.35800659582264566, + "grad_norm": 1.4892581701278687, + "learning_rate": 1.4381172770542283e-05, + "loss": 0.8886, + "step": 977 + }, + { + "epoch": 0.3583730304140711, + "grad_norm": 1.5779441595077515, + "learning_rate": 1.4370783553358735e-05, + "loss": 0.8548, + "step": 978 + }, + { + "epoch": 0.3587394650054965, + "grad_norm": 1.9342527389526367, + "learning_rate": 1.4360388501178925e-05, + "loss": 0.7853, + "step": 979 + }, + { + "epoch": 0.35910589959692196, + "grad_norm": 1.5269722938537598, + "learning_rate": 1.4349987627880251e-05, + "loss": 0.8765, + "step": 980 + }, + { + "epoch": 0.3594723341883474, + "grad_norm": 1.4872721433639526, + "learning_rate": 1.4339580947347877e-05, + "loss": 0.8588, + "step": 981 + }, + { + "epoch": 0.35983876877977283, + "grad_norm": 1.6207835674285889, + "learning_rate": 1.4329168473474722e-05, + "loss": 0.8713, + "step": 982 + }, + { + "epoch": 0.36020520337119827, + "grad_norm": 1.7222713232040405, + "learning_rate": 1.431875022016144e-05, + "loss": 0.9375, + "step": 983 + }, + { + "epoch": 0.36057163796262365, + "grad_norm": 1.682226300239563, + "learning_rate": 1.4308326201316399e-05, + "loss": 0.896, + "step": 984 + }, + { + "epoch": 0.3609380725540491, + "grad_norm": 1.8652957677841187, + "learning_rate": 1.4297896430855662e-05, + "loss": 0.826, + "step": 985 + }, + { + "epoch": 0.3613045071454745, + "grad_norm": 1.1483632326126099, + "learning_rate": 1.428746092270298e-05, + "loss": 0.5728, + "step": 986 + }, + { + "epoch": 0.36167094173689995, + "grad_norm": 1.5997368097305298, + "learning_rate": 1.4277019690789749e-05, + "loss": 0.9115, + "step": 987 + }, + { + "epoch": 0.3620373763283254, + "grad_norm": 1.8785408735275269, + "learning_rate": 1.4266572749055022e-05, + "loss": 0.8324, + "step": 988 + }, + { + "epoch": 0.3624038109197508, + "grad_norm": 1.4323786497116089, + "learning_rate": 1.425612011144546e-05, + "loss": 0.9204, + "step": 989 + }, + { + "epoch": 0.36277024551117626, + "grad_norm": 1.4402960538864136, + "learning_rate": 1.4245661791915336e-05, + "loss": 0.9272, + "step": 990 + }, + { + "epoch": 0.3631366801026017, + "grad_norm": 1.5204726457595825, + "learning_rate": 1.423519780442651e-05, + "loss": 0.8778, + "step": 991 + }, + { + "epoch": 0.36350311469402713, + "grad_norm": 1.6032034158706665, + "learning_rate": 1.4224728162948403e-05, + "loss": 0.8389, + "step": 992 + }, + { + "epoch": 0.36386954928545256, + "grad_norm": 2.3969008922576904, + "learning_rate": 1.4214252881457987e-05, + "loss": 0.7944, + "step": 993 + }, + { + "epoch": 0.364235983876878, + "grad_norm": 1.8171178102493286, + "learning_rate": 1.4203771973939769e-05, + "loss": 0.8405, + "step": 994 + }, + { + "epoch": 0.36460241846830344, + "grad_norm": 1.6910125017166138, + "learning_rate": 1.4193285454385755e-05, + "loss": 0.8838, + "step": 995 + }, + { + "epoch": 0.3649688530597288, + "grad_norm": 1.770917534828186, + "learning_rate": 1.4182793336795446e-05, + "loss": 0.7731, + "step": 996 + }, + { + "epoch": 0.36533528765115425, + "grad_norm": 1.5590789318084717, + "learning_rate": 1.417229563517583e-05, + "loss": 0.8358, + "step": 997 + }, + { + "epoch": 0.3657017222425797, + "grad_norm": 1.6439181566238403, + "learning_rate": 1.4161792363541329e-05, + "loss": 0.8601, + "step": 998 + }, + { + "epoch": 0.3660681568340051, + "grad_norm": 1.4935253858566284, + "learning_rate": 1.4151283535913819e-05, + "loss": 0.8239, + "step": 999 + }, + { + "epoch": 0.36643459142543056, + "grad_norm": 1.662537932395935, + "learning_rate": 1.414076916632258e-05, + "loss": 0.901, + "step": 1000 + }, + { + "epoch": 0.366801026016856, + "grad_norm": 1.5783257484436035, + "learning_rate": 1.4130249268804296e-05, + "loss": 0.7823, + "step": 1001 + }, + { + "epoch": 0.3671674606082814, + "grad_norm": 1.5855540037155151, + "learning_rate": 1.4119723857403039e-05, + "loss": 0.9523, + "step": 1002 + }, + { + "epoch": 0.36753389519970686, + "grad_norm": 1.349881887435913, + "learning_rate": 1.410919294617022e-05, + "loss": 0.8282, + "step": 1003 + }, + { + "epoch": 0.3679003297911323, + "grad_norm": 1.4568612575531006, + "learning_rate": 1.4098656549164615e-05, + "loss": 0.9527, + "step": 1004 + }, + { + "epoch": 0.36826676438255773, + "grad_norm": 1.461704969406128, + "learning_rate": 1.4088114680452314e-05, + "loss": 0.9008, + "step": 1005 + }, + { + "epoch": 0.36863319897398317, + "grad_norm": 1.5116150379180908, + "learning_rate": 1.4077567354106711e-05, + "loss": 0.8853, + "step": 1006 + }, + { + "epoch": 0.36899963356540855, + "grad_norm": 1.7470734119415283, + "learning_rate": 1.406701458420849e-05, + "loss": 0.8858, + "step": 1007 + }, + { + "epoch": 0.369366068156834, + "grad_norm": 1.6353769302368164, + "learning_rate": 1.4056456384845592e-05, + "loss": 0.8338, + "step": 1008 + }, + { + "epoch": 0.3697325027482594, + "grad_norm": 1.7861933708190918, + "learning_rate": 1.4045892770113222e-05, + "loss": 0.8883, + "step": 1009 + }, + { + "epoch": 0.37009893733968485, + "grad_norm": 1.5234062671661377, + "learning_rate": 1.40353237541138e-05, + "loss": 0.9258, + "step": 1010 + }, + { + "epoch": 0.3704653719311103, + "grad_norm": 1.6214821338653564, + "learning_rate": 1.4024749350956965e-05, + "loss": 0.8274, + "step": 1011 + }, + { + "epoch": 0.3708318065225357, + "grad_norm": 1.4756487607955933, + "learning_rate": 1.4014169574759548e-05, + "loss": 0.8759, + "step": 1012 + }, + { + "epoch": 0.37119824111396116, + "grad_norm": 1.4953577518463135, + "learning_rate": 1.4003584439645545e-05, + "loss": 0.935, + "step": 1013 + }, + { + "epoch": 0.3715646757053866, + "grad_norm": 1.6752123832702637, + "learning_rate": 1.399299395974612e-05, + "loss": 0.9121, + "step": 1014 + }, + { + "epoch": 0.37193111029681203, + "grad_norm": 1.8773505687713623, + "learning_rate": 1.398239814919956e-05, + "loss": 0.8101, + "step": 1015 + }, + { + "epoch": 0.37229754488823746, + "grad_norm": 1.6290862560272217, + "learning_rate": 1.3971797022151268e-05, + "loss": 0.9078, + "step": 1016 + }, + { + "epoch": 0.3726639794796629, + "grad_norm": 1.563781976699829, + "learning_rate": 1.3961190592753757e-05, + "loss": 0.877, + "step": 1017 + }, + { + "epoch": 0.37303041407108833, + "grad_norm": 1.7269985675811768, + "learning_rate": 1.3950578875166608e-05, + "loss": 0.8254, + "step": 1018 + }, + { + "epoch": 0.3733968486625137, + "grad_norm": 1.568712592124939, + "learning_rate": 1.3939961883556461e-05, + "loss": 0.8323, + "step": 1019 + }, + { + "epoch": 0.37376328325393915, + "grad_norm": 1.534967064857483, + "learning_rate": 1.3929339632097008e-05, + "loss": 0.8769, + "step": 1020 + }, + { + "epoch": 0.3741297178453646, + "grad_norm": 1.5710378885269165, + "learning_rate": 1.3918712134968946e-05, + "loss": 0.8604, + "step": 1021 + }, + { + "epoch": 0.37449615243679, + "grad_norm": 1.882443904876709, + "learning_rate": 1.3908079406359991e-05, + "loss": 0.8033, + "step": 1022 + }, + { + "epoch": 0.37486258702821545, + "grad_norm": 1.520538568496704, + "learning_rate": 1.3897441460464834e-05, + "loss": 0.9032, + "step": 1023 + }, + { + "epoch": 0.3752290216196409, + "grad_norm": 1.724283218383789, + "learning_rate": 1.3886798311485133e-05, + "loss": 0.8723, + "step": 1024 + }, + { + "epoch": 0.3755954562110663, + "grad_norm": 1.5896741151809692, + "learning_rate": 1.3876149973629492e-05, + "loss": 0.7985, + "step": 1025 + }, + { + "epoch": 0.37596189080249176, + "grad_norm": 1.8196618556976318, + "learning_rate": 1.386549646111344e-05, + "loss": 0.701, + "step": 1026 + }, + { + "epoch": 0.3763283253939172, + "grad_norm": 1.603785514831543, + "learning_rate": 1.3854837788159424e-05, + "loss": 0.8844, + "step": 1027 + }, + { + "epoch": 0.37669475998534263, + "grad_norm": 1.6494451761245728, + "learning_rate": 1.384417396899677e-05, + "loss": 0.8592, + "step": 1028 + }, + { + "epoch": 0.37706119457676807, + "grad_norm": 1.7041468620300293, + "learning_rate": 1.3833505017861674e-05, + "loss": 0.8691, + "step": 1029 + }, + { + "epoch": 0.3774276291681935, + "grad_norm": 1.751847267150879, + "learning_rate": 1.382283094899719e-05, + "loss": 0.8117, + "step": 1030 + }, + { + "epoch": 0.3777940637596189, + "grad_norm": 1.5712251663208008, + "learning_rate": 1.3812151776653195e-05, + "loss": 0.8617, + "step": 1031 + }, + { + "epoch": 0.3781604983510443, + "grad_norm": 1.6731207370758057, + "learning_rate": 1.3801467515086391e-05, + "loss": 0.8124, + "step": 1032 + }, + { + "epoch": 0.37852693294246975, + "grad_norm": 1.4233975410461426, + "learning_rate": 1.3790778178560266e-05, + "loss": 0.8663, + "step": 1033 + }, + { + "epoch": 0.3788933675338952, + "grad_norm": 1.6427613496780396, + "learning_rate": 1.3780083781345084e-05, + "loss": 0.9296, + "step": 1034 + }, + { + "epoch": 0.3792598021253206, + "grad_norm": 1.4288830757141113, + "learning_rate": 1.3769384337717864e-05, + "loss": 0.8394, + "step": 1035 + }, + { + "epoch": 0.37962623671674606, + "grad_norm": 1.548366904258728, + "learning_rate": 1.3758679861962366e-05, + "loss": 0.8548, + "step": 1036 + }, + { + "epoch": 0.3799926713081715, + "grad_norm": 1.6227176189422607, + "learning_rate": 1.3747970368369063e-05, + "loss": 0.8663, + "step": 1037 + }, + { + "epoch": 0.3803591058995969, + "grad_norm": 1.6395370960235596, + "learning_rate": 1.3737255871235133e-05, + "loss": 0.7948, + "step": 1038 + }, + { + "epoch": 0.38072554049102236, + "grad_norm": 1.5784190893173218, + "learning_rate": 1.372653638486443e-05, + "loss": 0.8652, + "step": 1039 + }, + { + "epoch": 0.3810919750824478, + "grad_norm": 1.3747224807739258, + "learning_rate": 1.3715811923567468e-05, + "loss": 0.8831, + "step": 1040 + }, + { + "epoch": 0.38145840967387323, + "grad_norm": 1.5469486713409424, + "learning_rate": 1.3705082501661402e-05, + "loss": 0.8871, + "step": 1041 + }, + { + "epoch": 0.38182484426529867, + "grad_norm": 1.5060057640075684, + "learning_rate": 1.369434813347001e-05, + "loss": 0.907, + "step": 1042 + }, + { + "epoch": 0.38219127885672405, + "grad_norm": 1.5088938474655151, + "learning_rate": 1.368360883332368e-05, + "loss": 0.8994, + "step": 1043 + }, + { + "epoch": 0.3825577134481495, + "grad_norm": 1.589644432067871, + "learning_rate": 1.3672864615559372e-05, + "loss": 0.8767, + "step": 1044 + }, + { + "epoch": 0.3829241480395749, + "grad_norm": 1.5223177671432495, + "learning_rate": 1.3662115494520618e-05, + "loss": 0.9506, + "step": 1045 + }, + { + "epoch": 0.38329058263100035, + "grad_norm": 1.5508954524993896, + "learning_rate": 1.36513614845575e-05, + "loss": 0.9139, + "step": 1046 + }, + { + "epoch": 0.3836570172224258, + "grad_norm": 1.9155925512313843, + "learning_rate": 1.3640602600026613e-05, + "loss": 0.7311, + "step": 1047 + }, + { + "epoch": 0.3840234518138512, + "grad_norm": 1.6248713731765747, + "learning_rate": 1.3629838855291078e-05, + "loss": 0.8691, + "step": 1048 + }, + { + "epoch": 0.38438988640527666, + "grad_norm": 1.4297411441802979, + "learning_rate": 1.3619070264720489e-05, + "loss": 0.8643, + "step": 1049 + }, + { + "epoch": 0.3847563209967021, + "grad_norm": 1.5291481018066406, + "learning_rate": 1.3608296842690916e-05, + "loss": 0.8865, + "step": 1050 + }, + { + "epoch": 0.38512275558812753, + "grad_norm": 1.5512865781784058, + "learning_rate": 1.3597518603584878e-05, + "loss": 0.8837, + "step": 1051 + }, + { + "epoch": 0.38548919017955297, + "grad_norm": 1.4452577829360962, + "learning_rate": 1.3586735561791327e-05, + "loss": 0.9034, + "step": 1052 + }, + { + "epoch": 0.3858556247709784, + "grad_norm": 1.3865025043487549, + "learning_rate": 1.3575947731705624e-05, + "loss": 0.8562, + "step": 1053 + }, + { + "epoch": 0.38622205936240384, + "grad_norm": 1.528044581413269, + "learning_rate": 1.3565155127729516e-05, + "loss": 0.9205, + "step": 1054 + }, + { + "epoch": 0.3865884939538292, + "grad_norm": 1.539243221282959, + "learning_rate": 1.3554357764271138e-05, + "loss": 0.8596, + "step": 1055 + }, + { + "epoch": 0.38695492854525465, + "grad_norm": 1.7668794393539429, + "learning_rate": 1.3543555655744966e-05, + "loss": 0.8194, + "step": 1056 + }, + { + "epoch": 0.3873213631366801, + "grad_norm": 1.5567411184310913, + "learning_rate": 1.3532748816571821e-05, + "loss": 0.9152, + "step": 1057 + }, + { + "epoch": 0.3876877977281055, + "grad_norm": 1.575163722038269, + "learning_rate": 1.3521937261178826e-05, + "loss": 0.87, + "step": 1058 + }, + { + "epoch": 0.38805423231953096, + "grad_norm": 1.6608469486236572, + "learning_rate": 1.3511121003999414e-05, + "loss": 0.8094, + "step": 1059 + }, + { + "epoch": 0.3884206669109564, + "grad_norm": 1.887389063835144, + "learning_rate": 1.3500300059473285e-05, + "loss": 0.7911, + "step": 1060 + }, + { + "epoch": 0.3887871015023818, + "grad_norm": 1.8042851686477661, + "learning_rate": 1.3489474442046404e-05, + "loss": 0.8217, + "step": 1061 + }, + { + "epoch": 0.38915353609380726, + "grad_norm": 1.6864506006240845, + "learning_rate": 1.3478644166170968e-05, + "loss": 0.9135, + "step": 1062 + }, + { + "epoch": 0.3895199706852327, + "grad_norm": 1.9867236614227295, + "learning_rate": 1.3467809246305398e-05, + "loss": 0.8669, + "step": 1063 + }, + { + "epoch": 0.38988640527665813, + "grad_norm": 1.8301372528076172, + "learning_rate": 1.3456969696914313e-05, + "loss": 0.813, + "step": 1064 + }, + { + "epoch": 0.39025283986808357, + "grad_norm": 1.6149121522903442, + "learning_rate": 1.3446125532468507e-05, + "loss": 0.8958, + "step": 1065 + }, + { + "epoch": 0.390619274459509, + "grad_norm": 1.5697036981582642, + "learning_rate": 1.3435276767444946e-05, + "loss": 0.921, + "step": 1066 + }, + { + "epoch": 0.3909857090509344, + "grad_norm": 1.525789499282837, + "learning_rate": 1.3424423416326727e-05, + "loss": 0.8502, + "step": 1067 + }, + { + "epoch": 0.3913521436423598, + "grad_norm": 1.7171850204467773, + "learning_rate": 1.341356549360308e-05, + "loss": 0.7838, + "step": 1068 + }, + { + "epoch": 0.39171857823378525, + "grad_norm": 1.521950364112854, + "learning_rate": 1.3402703013769326e-05, + "loss": 0.8726, + "step": 1069 + }, + { + "epoch": 0.3920850128252107, + "grad_norm": 1.6256566047668457, + "learning_rate": 1.3391835991326882e-05, + "loss": 0.954, + "step": 1070 + }, + { + "epoch": 0.3924514474166361, + "grad_norm": 1.6131467819213867, + "learning_rate": 1.338096444078322e-05, + "loss": 0.884, + "step": 1071 + }, + { + "epoch": 0.39281788200806156, + "grad_norm": 1.5478419065475464, + "learning_rate": 1.337008837665186e-05, + "loss": 0.8749, + "step": 1072 + }, + { + "epoch": 0.393184316599487, + "grad_norm": 1.5852874517440796, + "learning_rate": 1.3359207813452348e-05, + "loss": 0.8507, + "step": 1073 + }, + { + "epoch": 0.39355075119091243, + "grad_norm": 1.6789813041687012, + "learning_rate": 1.334832276571024e-05, + "loss": 0.8886, + "step": 1074 + }, + { + "epoch": 0.39391718578233786, + "grad_norm": 1.699114203453064, + "learning_rate": 1.3337433247957075e-05, + "loss": 0.8698, + "step": 1075 + }, + { + "epoch": 0.3942836203737633, + "grad_norm": 1.515486240386963, + "learning_rate": 1.3326539274730354e-05, + "loss": 0.8625, + "step": 1076 + }, + { + "epoch": 0.39465005496518873, + "grad_norm": 1.5709253549575806, + "learning_rate": 1.3315640860573537e-05, + "loss": 0.8617, + "step": 1077 + }, + { + "epoch": 0.39501648955661417, + "grad_norm": 1.5914605855941772, + "learning_rate": 1.3304738020036006e-05, + "loss": 0.8855, + "step": 1078 + }, + { + "epoch": 0.39538292414803955, + "grad_norm": 1.4676405191421509, + "learning_rate": 1.3293830767673055e-05, + "loss": 0.8757, + "step": 1079 + }, + { + "epoch": 0.395749358739465, + "grad_norm": 1.7750840187072754, + "learning_rate": 1.3282919118045866e-05, + "loss": 0.8586, + "step": 1080 + }, + { + "epoch": 0.3961157933308904, + "grad_norm": 1.5571708679199219, + "learning_rate": 1.3272003085721485e-05, + "loss": 0.9255, + "step": 1081 + }, + { + "epoch": 0.39648222792231586, + "grad_norm": 1.6908925771713257, + "learning_rate": 1.3261082685272827e-05, + "loss": 0.8494, + "step": 1082 + }, + { + "epoch": 0.3968486625137413, + "grad_norm": 1.520818829536438, + "learning_rate": 1.325015793127862e-05, + "loss": 0.8573, + "step": 1083 + }, + { + "epoch": 0.3972150971051667, + "grad_norm": 1.4602961540222168, + "learning_rate": 1.3239228838323414e-05, + "loss": 0.9259, + "step": 1084 + }, + { + "epoch": 0.39758153169659216, + "grad_norm": 1.6277791261672974, + "learning_rate": 1.3228295420997546e-05, + "loss": 0.8911, + "step": 1085 + }, + { + "epoch": 0.3979479662880176, + "grad_norm": 1.7978880405426025, + "learning_rate": 1.3217357693897133e-05, + "loss": 0.9046, + "step": 1086 + }, + { + "epoch": 0.39831440087944303, + "grad_norm": 1.6153454780578613, + "learning_rate": 1.3206415671624036e-05, + "loss": 0.8602, + "step": 1087 + }, + { + "epoch": 0.39868083547086847, + "grad_norm": 1.5234719514846802, + "learning_rate": 1.3195469368785858e-05, + "loss": 0.9155, + "step": 1088 + }, + { + "epoch": 0.3990472700622939, + "grad_norm": 1.5738810300827026, + "learning_rate": 1.3184518799995912e-05, + "loss": 0.853, + "step": 1089 + }, + { + "epoch": 0.39941370465371934, + "grad_norm": 1.6827136278152466, + "learning_rate": 1.3173563979873212e-05, + "loss": 0.9015, + "step": 1090 + }, + { + "epoch": 0.3997801392451447, + "grad_norm": 1.7982146739959717, + "learning_rate": 1.316260492304244e-05, + "loss": 0.8608, + "step": 1091 + }, + { + "epoch": 0.40014657383657015, + "grad_norm": 1.711007833480835, + "learning_rate": 1.3151641644133942e-05, + "loss": 0.8817, + "step": 1092 + }, + { + "epoch": 0.4005130084279956, + "grad_norm": 1.7627228498458862, + "learning_rate": 1.3140674157783686e-05, + "loss": 0.8585, + "step": 1093 + }, + { + "epoch": 0.400879443019421, + "grad_norm": 1.6841421127319336, + "learning_rate": 1.3129702478633275e-05, + "loss": 0.8468, + "step": 1094 + }, + { + "epoch": 0.40124587761084646, + "grad_norm": 1.6395200490951538, + "learning_rate": 1.31187266213299e-05, + "loss": 0.91, + "step": 1095 + }, + { + "epoch": 0.4016123122022719, + "grad_norm": 1.6827080249786377, + "learning_rate": 1.310774660052633e-05, + "loss": 0.8466, + "step": 1096 + }, + { + "epoch": 0.40197874679369733, + "grad_norm": 1.5233503580093384, + "learning_rate": 1.3096762430880894e-05, + "loss": 0.9055, + "step": 1097 + }, + { + "epoch": 0.40234518138512276, + "grad_norm": 1.6009891033172607, + "learning_rate": 1.308577412705746e-05, + "loss": 0.8622, + "step": 1098 + }, + { + "epoch": 0.4027116159765482, + "grad_norm": 1.6245487928390503, + "learning_rate": 1.307478170372541e-05, + "loss": 0.8656, + "step": 1099 + }, + { + "epoch": 0.40307805056797363, + "grad_norm": 1.6522853374481201, + "learning_rate": 1.3063785175559634e-05, + "loss": 0.9106, + "step": 1100 + }, + { + "epoch": 0.40344448515939907, + "grad_norm": 1.8431390523910522, + "learning_rate": 1.3052784557240495e-05, + "loss": 0.8102, + "step": 1101 + }, + { + "epoch": 0.4038109197508245, + "grad_norm": 1.626137614250183, + "learning_rate": 1.3041779863453821e-05, + "loss": 0.8664, + "step": 1102 + }, + { + "epoch": 0.4041773543422499, + "grad_norm": 1.595874547958374, + "learning_rate": 1.3030771108890878e-05, + "loss": 0.8705, + "step": 1103 + }, + { + "epoch": 0.4045437889336753, + "grad_norm": 1.4765444993972778, + "learning_rate": 1.3019758308248351e-05, + "loss": 0.8728, + "step": 1104 + }, + { + "epoch": 0.40491022352510075, + "grad_norm": 1.9476228952407837, + "learning_rate": 1.3008741476228335e-05, + "loss": 0.849, + "step": 1105 + }, + { + "epoch": 0.4052766581165262, + "grad_norm": 1.7913713455200195, + "learning_rate": 1.2997720627538297e-05, + "loss": 0.8579, + "step": 1106 + }, + { + "epoch": 0.4056430927079516, + "grad_norm": 1.8295342922210693, + "learning_rate": 1.2986695776891072e-05, + "loss": 0.8795, + "step": 1107 + }, + { + "epoch": 0.40600952729937706, + "grad_norm": 1.7225682735443115, + "learning_rate": 1.2975666939004839e-05, + "loss": 0.8328, + "step": 1108 + }, + { + "epoch": 0.4063759618908025, + "grad_norm": 1.6390389204025269, + "learning_rate": 1.2964634128603092e-05, + "loss": 0.869, + "step": 1109 + }, + { + "epoch": 0.40674239648222793, + "grad_norm": 1.4864574670791626, + "learning_rate": 1.2953597360414636e-05, + "loss": 0.9074, + "step": 1110 + }, + { + "epoch": 0.40710883107365337, + "grad_norm": 1.5992265939712524, + "learning_rate": 1.2942556649173557e-05, + "loss": 0.8928, + "step": 1111 + }, + { + "epoch": 0.4074752656650788, + "grad_norm": 1.5651394128799438, + "learning_rate": 1.2931512009619202e-05, + "loss": 0.8772, + "step": 1112 + }, + { + "epoch": 0.40784170025650424, + "grad_norm": 1.669217586517334, + "learning_rate": 1.292046345649617e-05, + "loss": 0.9396, + "step": 1113 + }, + { + "epoch": 0.40820813484792967, + "grad_norm": 1.7099789381027222, + "learning_rate": 1.290941100455428e-05, + "loss": 0.8651, + "step": 1114 + }, + { + "epoch": 0.40857456943935505, + "grad_norm": 1.6064772605895996, + "learning_rate": 1.2898354668548554e-05, + "loss": 0.9055, + "step": 1115 + }, + { + "epoch": 0.4089410040307805, + "grad_norm": 1.5557650327682495, + "learning_rate": 1.28872944632392e-05, + "loss": 0.8809, + "step": 1116 + }, + { + "epoch": 0.4093074386222059, + "grad_norm": 1.7314295768737793, + "learning_rate": 1.2876230403391592e-05, + "loss": 0.8917, + "step": 1117 + }, + { + "epoch": 0.40967387321363136, + "grad_norm": 1.8550328016281128, + "learning_rate": 1.2865162503776257e-05, + "loss": 0.9015, + "step": 1118 + }, + { + "epoch": 0.4100403078050568, + "grad_norm": 1.5810977220535278, + "learning_rate": 1.285409077916883e-05, + "loss": 0.8637, + "step": 1119 + }, + { + "epoch": 0.4104067423964822, + "grad_norm": 1.6856402158737183, + "learning_rate": 1.2843015244350077e-05, + "loss": 0.8522, + "step": 1120 + }, + { + "epoch": 0.41077317698790766, + "grad_norm": 1.704975962638855, + "learning_rate": 1.2831935914105831e-05, + "loss": 0.8145, + "step": 1121 + }, + { + "epoch": 0.4111396115793331, + "grad_norm": 1.7331234216690063, + "learning_rate": 1.2820852803226993e-05, + "loss": 0.9148, + "step": 1122 + }, + { + "epoch": 0.41150604617075853, + "grad_norm": 1.8245302438735962, + "learning_rate": 1.2809765926509527e-05, + "loss": 0.8974, + "step": 1123 + }, + { + "epoch": 0.41187248076218397, + "grad_norm": 1.6337167024612427, + "learning_rate": 1.2798675298754409e-05, + "loss": 0.8307, + "step": 1124 + }, + { + "epoch": 0.4122389153536094, + "grad_norm": 1.7906094789505005, + "learning_rate": 1.2787580934767625e-05, + "loss": 0.7789, + "step": 1125 + }, + { + "epoch": 0.41260534994503484, + "grad_norm": 1.7973666191101074, + "learning_rate": 1.277648284936016e-05, + "loss": 0.8781, + "step": 1126 + }, + { + "epoch": 0.4129717845364602, + "grad_norm": 1.5800189971923828, + "learning_rate": 1.2765381057347945e-05, + "loss": 0.8426, + "step": 1127 + }, + { + "epoch": 0.41333821912788565, + "grad_norm": 1.8464257717132568, + "learning_rate": 1.2754275573551886e-05, + "loss": 0.9326, + "step": 1128 + }, + { + "epoch": 0.4137046537193111, + "grad_norm": 1.7335541248321533, + "learning_rate": 1.2743166412797799e-05, + "loss": 0.7967, + "step": 1129 + }, + { + "epoch": 0.4140710883107365, + "grad_norm": 1.535427212715149, + "learning_rate": 1.273205358991641e-05, + "loss": 0.9106, + "step": 1130 + }, + { + "epoch": 0.41443752290216196, + "grad_norm": 1.7636157274246216, + "learning_rate": 1.2720937119743347e-05, + "loss": 0.8137, + "step": 1131 + }, + { + "epoch": 0.4148039574935874, + "grad_norm": 1.6306973695755005, + "learning_rate": 1.2709817017119093e-05, + "loss": 0.8591, + "step": 1132 + }, + { + "epoch": 0.41517039208501283, + "grad_norm": 1.6848173141479492, + "learning_rate": 1.2698693296888983e-05, + "loss": 0.8442, + "step": 1133 + }, + { + "epoch": 0.41553682667643826, + "grad_norm": 1.9436910152435303, + "learning_rate": 1.2687565973903193e-05, + "loss": 0.8002, + "step": 1134 + }, + { + "epoch": 0.4159032612678637, + "grad_norm": 1.861436128616333, + "learning_rate": 1.2676435063016688e-05, + "loss": 0.8841, + "step": 1135 + }, + { + "epoch": 0.41626969585928913, + "grad_norm": 1.7975494861602783, + "learning_rate": 1.2665300579089245e-05, + "loss": 0.8786, + "step": 1136 + }, + { + "epoch": 0.41663613045071457, + "grad_norm": 1.812900185585022, + "learning_rate": 1.2654162536985393e-05, + "loss": 0.8586, + "step": 1137 + }, + { + "epoch": 0.41700256504214, + "grad_norm": 1.8576949834823608, + "learning_rate": 1.2643020951574425e-05, + "loss": 0.8443, + "step": 1138 + }, + { + "epoch": 0.4173689996335654, + "grad_norm": 1.931456208229065, + "learning_rate": 1.2631875837730355e-05, + "loss": 0.8383, + "step": 1139 + }, + { + "epoch": 0.4177354342249908, + "grad_norm": 1.7419774532318115, + "learning_rate": 1.262072721033191e-05, + "loss": 0.8625, + "step": 1140 + }, + { + "epoch": 0.41810186881641626, + "grad_norm": 1.7812283039093018, + "learning_rate": 1.2609575084262506e-05, + "loss": 0.864, + "step": 1141 + }, + { + "epoch": 0.4184683034078417, + "grad_norm": 1.7734988927841187, + "learning_rate": 1.2598419474410238e-05, + "loss": 0.8285, + "step": 1142 + }, + { + "epoch": 0.4188347379992671, + "grad_norm": 1.7850379943847656, + "learning_rate": 1.2587260395667839e-05, + "loss": 0.8713, + "step": 1143 + }, + { + "epoch": 0.41920117259069256, + "grad_norm": 1.765945315361023, + "learning_rate": 1.2576097862932687e-05, + "loss": 0.8273, + "step": 1144 + }, + { + "epoch": 0.419567607182118, + "grad_norm": 1.604203701019287, + "learning_rate": 1.2564931891106755e-05, + "loss": 0.8301, + "step": 1145 + }, + { + "epoch": 0.41993404177354343, + "grad_norm": 1.6468043327331543, + "learning_rate": 1.2553762495096624e-05, + "loss": 0.9061, + "step": 1146 + }, + { + "epoch": 0.42030047636496887, + "grad_norm": 1.7401213645935059, + "learning_rate": 1.2542589689813433e-05, + "loss": 0.8487, + "step": 1147 + }, + { + "epoch": 0.4206669109563943, + "grad_norm": 1.7834421396255493, + "learning_rate": 1.2531413490172882e-05, + "loss": 0.8003, + "step": 1148 + }, + { + "epoch": 0.42103334554781974, + "grad_norm": 1.6583036184310913, + "learning_rate": 1.2520233911095194e-05, + "loss": 0.8486, + "step": 1149 + }, + { + "epoch": 0.4213997801392451, + "grad_norm": 1.8208075761795044, + "learning_rate": 1.250905096750511e-05, + "loss": 0.8591, + "step": 1150 + }, + { + "epoch": 0.42176621473067055, + "grad_norm": 1.756757140159607, + "learning_rate": 1.2497864674331858e-05, + "loss": 0.9033, + "step": 1151 + }, + { + "epoch": 0.422132649322096, + "grad_norm": 1.716612458229065, + "learning_rate": 1.2486675046509143e-05, + "loss": 0.8855, + "step": 1152 + }, + { + "epoch": 0.4224990839135214, + "grad_norm": 1.66440749168396, + "learning_rate": 1.2475482098975116e-05, + "loss": 0.8667, + "step": 1153 + }, + { + "epoch": 0.42286551850494686, + "grad_norm": 1.9110910892486572, + "learning_rate": 1.2464285846672365e-05, + "loss": 0.8465, + "step": 1154 + }, + { + "epoch": 0.4232319530963723, + "grad_norm": 1.6943145990371704, + "learning_rate": 1.2453086304547885e-05, + "loss": 0.8959, + "step": 1155 + }, + { + "epoch": 0.42359838768779773, + "grad_norm": 1.7550324201583862, + "learning_rate": 1.2441883487553066e-05, + "loss": 0.8589, + "step": 1156 + }, + { + "epoch": 0.42396482227922316, + "grad_norm": 1.9641809463500977, + "learning_rate": 1.243067741064367e-05, + "loss": 0.8543, + "step": 1157 + }, + { + "epoch": 0.4243312568706486, + "grad_norm": 1.8191912174224854, + "learning_rate": 1.241946808877981e-05, + "loss": 0.8622, + "step": 1158 + }, + { + "epoch": 0.42469769146207403, + "grad_norm": 1.6547656059265137, + "learning_rate": 1.240825553692593e-05, + "loss": 0.8519, + "step": 1159 + }, + { + "epoch": 0.42506412605349947, + "grad_norm": 1.656889796257019, + "learning_rate": 1.2397039770050787e-05, + "loss": 0.9292, + "step": 1160 + }, + { + "epoch": 0.4254305606449249, + "grad_norm": 1.8495047092437744, + "learning_rate": 1.2385820803127436e-05, + "loss": 0.8511, + "step": 1161 + }, + { + "epoch": 0.4257969952363503, + "grad_norm": 1.9155131578445435, + "learning_rate": 1.2374598651133191e-05, + "loss": 0.8477, + "step": 1162 + }, + { + "epoch": 0.4261634298277757, + "grad_norm": 1.7109763622283936, + "learning_rate": 1.2363373329049629e-05, + "loss": 0.8628, + "step": 1163 + }, + { + "epoch": 0.42652986441920115, + "grad_norm": 1.6846760511398315, + "learning_rate": 1.2352144851862553e-05, + "loss": 0.8688, + "step": 1164 + }, + { + "epoch": 0.4268962990106266, + "grad_norm": 1.8518428802490234, + "learning_rate": 1.2340913234561986e-05, + "loss": 0.8429, + "step": 1165 + }, + { + "epoch": 0.427262733602052, + "grad_norm": 1.9886471033096313, + "learning_rate": 1.2329678492142134e-05, + "loss": 0.7325, + "step": 1166 + }, + { + "epoch": 0.42762916819347746, + "grad_norm": 1.7306740283966064, + "learning_rate": 1.2318440639601378e-05, + "loss": 0.8877, + "step": 1167 + }, + { + "epoch": 0.4279956027849029, + "grad_norm": 2.007984161376953, + "learning_rate": 1.2307199691942252e-05, + "loss": 0.7804, + "step": 1168 + }, + { + "epoch": 0.42836203737632833, + "grad_norm": 1.6732981204986572, + "learning_rate": 1.2295955664171421e-05, + "loss": 0.8736, + "step": 1169 + }, + { + "epoch": 0.42872847196775377, + "grad_norm": 1.597426414489746, + "learning_rate": 1.2284708571299669e-05, + "loss": 0.8723, + "step": 1170 + }, + { + "epoch": 0.4290949065591792, + "grad_norm": 1.7663847208023071, + "learning_rate": 1.2273458428341856e-05, + "loss": 0.9053, + "step": 1171 + }, + { + "epoch": 0.42946134115060464, + "grad_norm": 1.6893157958984375, + "learning_rate": 1.226220525031693e-05, + "loss": 0.8663, + "step": 1172 + }, + { + "epoch": 0.42982777574203007, + "grad_norm": 1.6163278818130493, + "learning_rate": 1.2250949052247883e-05, + "loss": 0.9114, + "step": 1173 + }, + { + "epoch": 0.43019421033345545, + "grad_norm": 1.7489039897918701, + "learning_rate": 1.2239689849161738e-05, + "loss": 0.8597, + "step": 1174 + }, + { + "epoch": 0.4305606449248809, + "grad_norm": 1.7178845405578613, + "learning_rate": 1.2228427656089532e-05, + "loss": 0.8663, + "step": 1175 + }, + { + "epoch": 0.4309270795163063, + "grad_norm": 1.952564001083374, + "learning_rate": 1.2217162488066295e-05, + "loss": 0.8371, + "step": 1176 + }, + { + "epoch": 0.43129351410773176, + "grad_norm": 1.9330706596374512, + "learning_rate": 1.2205894360131026e-05, + "loss": 0.863, + "step": 1177 + }, + { + "epoch": 0.4316599486991572, + "grad_norm": 1.6406487226486206, + "learning_rate": 1.2194623287326679e-05, + "loss": 0.9078, + "step": 1178 + }, + { + "epoch": 0.4320263832905826, + "grad_norm": 1.7578526735305786, + "learning_rate": 1.218334928470013e-05, + "loss": 0.7632, + "step": 1179 + }, + { + "epoch": 0.43239281788200806, + "grad_norm": 1.9374595880508423, + "learning_rate": 1.2172072367302178e-05, + "loss": 0.7159, + "step": 1180 + }, + { + "epoch": 0.4327592524734335, + "grad_norm": 1.8487389087677002, + "learning_rate": 1.2160792550187509e-05, + "loss": 0.8132, + "step": 1181 + }, + { + "epoch": 0.43312568706485893, + "grad_norm": 2.047722816467285, + "learning_rate": 1.2149509848414676e-05, + "loss": 0.7963, + "step": 1182 + }, + { + "epoch": 0.43349212165628437, + "grad_norm": 1.8400959968566895, + "learning_rate": 1.2138224277046089e-05, + "loss": 0.8553, + "step": 1183 + }, + { + "epoch": 0.4338585562477098, + "grad_norm": 1.7545138597488403, + "learning_rate": 1.2126935851147987e-05, + "loss": 0.8333, + "step": 1184 + }, + { + "epoch": 0.43422499083913524, + "grad_norm": 1.6373624801635742, + "learning_rate": 1.2115644585790419e-05, + "loss": 0.9219, + "step": 1185 + }, + { + "epoch": 0.4345914254305606, + "grad_norm": 1.748104453086853, + "learning_rate": 1.2104350496047221e-05, + "loss": 0.8288, + "step": 1186 + }, + { + "epoch": 0.43495786002198605, + "grad_norm": 1.9308727979660034, + "learning_rate": 1.2093053596996005e-05, + "loss": 0.8304, + "step": 1187 + }, + { + "epoch": 0.4353242946134115, + "grad_norm": 1.7878676652908325, + "learning_rate": 1.2081753903718137e-05, + "loss": 0.8218, + "step": 1188 + }, + { + "epoch": 0.4356907292048369, + "grad_norm": 1.71224045753479, + "learning_rate": 1.2070451431298702e-05, + "loss": 0.875, + "step": 1189 + }, + { + "epoch": 0.43605716379626236, + "grad_norm": 1.8005670309066772, + "learning_rate": 1.2059146194826503e-05, + "loss": 0.8343, + "step": 1190 + }, + { + "epoch": 0.4364235983876878, + "grad_norm": 1.8443491458892822, + "learning_rate": 1.204783820939403e-05, + "loss": 0.7996, + "step": 1191 + }, + { + "epoch": 0.43679003297911323, + "grad_norm": 2.1138112545013428, + "learning_rate": 1.2036527490097445e-05, + "loss": 0.805, + "step": 1192 + }, + { + "epoch": 0.43715646757053866, + "grad_norm": 1.7158002853393555, + "learning_rate": 1.202521405203656e-05, + "loss": 0.8328, + "step": 1193 + }, + { + "epoch": 0.4375229021619641, + "grad_norm": 1.6394296884536743, + "learning_rate": 1.2013897910314816e-05, + "loss": 0.8793, + "step": 1194 + }, + { + "epoch": 0.43788933675338954, + "grad_norm": 1.9240604639053345, + "learning_rate": 1.200257908003926e-05, + "loss": 0.8484, + "step": 1195 + }, + { + "epoch": 0.43825577134481497, + "grad_norm": 1.6958271265029907, + "learning_rate": 1.1991257576320537e-05, + "loss": 0.9014, + "step": 1196 + }, + { + "epoch": 0.4386222059362404, + "grad_norm": 1.7841869592666626, + "learning_rate": 1.1979933414272844e-05, + "loss": 0.8969, + "step": 1197 + }, + { + "epoch": 0.4389886405276658, + "grad_norm": 1.571500539779663, + "learning_rate": 1.1968606609013949e-05, + "loss": 0.8512, + "step": 1198 + }, + { + "epoch": 0.4393550751190912, + "grad_norm": 1.7198463678359985, + "learning_rate": 1.195727717566513e-05, + "loss": 0.8946, + "step": 1199 + }, + { + "epoch": 0.43972150971051666, + "grad_norm": 1.7086559534072876, + "learning_rate": 1.1945945129351184e-05, + "loss": 0.8485, + "step": 1200 + }, + { + "epoch": 0.4400879443019421, + "grad_norm": 1.6819955110549927, + "learning_rate": 1.1934610485200398e-05, + "loss": 0.8898, + "step": 1201 + }, + { + "epoch": 0.4404543788933675, + "grad_norm": 1.8454056978225708, + "learning_rate": 1.1923273258344513e-05, + "loss": 0.8706, + "step": 1202 + }, + { + "epoch": 0.44082081348479296, + "grad_norm": 1.705040693283081, + "learning_rate": 1.1911933463918734e-05, + "loss": 0.8426, + "step": 1203 + }, + { + "epoch": 0.4411872480762184, + "grad_norm": 1.9316009283065796, + "learning_rate": 1.1900591117061684e-05, + "loss": 0.7853, + "step": 1204 + }, + { + "epoch": 0.44155368266764383, + "grad_norm": 1.7013500928878784, + "learning_rate": 1.1889246232915399e-05, + "loss": 0.7973, + "step": 1205 + }, + { + "epoch": 0.44192011725906927, + "grad_norm": 1.675676941871643, + "learning_rate": 1.1877898826625298e-05, + "loss": 0.8407, + "step": 1206 + }, + { + "epoch": 0.4422865518504947, + "grad_norm": 1.847190499305725, + "learning_rate": 1.1866548913340174e-05, + "loss": 0.8683, + "step": 1207 + }, + { + "epoch": 0.44265298644192014, + "grad_norm": 2.021604537963867, + "learning_rate": 1.1855196508212157e-05, + "loss": 0.8217, + "step": 1208 + }, + { + "epoch": 0.4430194210333456, + "grad_norm": 1.7539337873458862, + "learning_rate": 1.1843841626396705e-05, + "loss": 0.8669, + "step": 1209 + }, + { + "epoch": 0.44338585562477095, + "grad_norm": 1.8103632926940918, + "learning_rate": 1.1832484283052592e-05, + "loss": 0.9176, + "step": 1210 + }, + { + "epoch": 0.4437522902161964, + "grad_norm": 1.998170256614685, + "learning_rate": 1.1821124493341868e-05, + "loss": 0.9539, + "step": 1211 + }, + { + "epoch": 0.4441187248076218, + "grad_norm": 1.938361644744873, + "learning_rate": 1.180976227242986e-05, + "loss": 0.7924, + "step": 1212 + }, + { + "epoch": 0.44448515939904726, + "grad_norm": 1.9992772340774536, + "learning_rate": 1.1798397635485124e-05, + "loss": 0.9023, + "step": 1213 + }, + { + "epoch": 0.4448515939904727, + "grad_norm": 1.6736135482788086, + "learning_rate": 1.1787030597679456e-05, + "loss": 0.9319, + "step": 1214 + }, + { + "epoch": 0.44521802858189813, + "grad_norm": 1.6292542219161987, + "learning_rate": 1.1775661174187851e-05, + "loss": 0.8363, + "step": 1215 + }, + { + "epoch": 0.44558446317332356, + "grad_norm": 1.8140482902526855, + "learning_rate": 1.1764289380188492e-05, + "loss": 0.8774, + "step": 1216 + }, + { + "epoch": 0.445950897764749, + "grad_norm": 1.6420940160751343, + "learning_rate": 1.1752915230862718e-05, + "loss": 0.8684, + "step": 1217 + }, + { + "epoch": 0.44631733235617443, + "grad_norm": 1.891953945159912, + "learning_rate": 1.1741538741395028e-05, + "loss": 0.8682, + "step": 1218 + }, + { + "epoch": 0.44668376694759987, + "grad_norm": 1.9159663915634155, + "learning_rate": 1.173015992697303e-05, + "loss": 0.8118, + "step": 1219 + }, + { + "epoch": 0.4470502015390253, + "grad_norm": 1.684457778930664, + "learning_rate": 1.1718778802787443e-05, + "loss": 0.8647, + "step": 1220 + }, + { + "epoch": 0.44741663613045074, + "grad_norm": 1.9485814571380615, + "learning_rate": 1.170739538403207e-05, + "loss": 0.8042, + "step": 1221 + }, + { + "epoch": 0.4477830707218761, + "grad_norm": 1.6726328134536743, + "learning_rate": 1.1696009685903773e-05, + "loss": 0.8763, + "step": 1222 + }, + { + "epoch": 0.44814950531330155, + "grad_norm": 1.5836412906646729, + "learning_rate": 1.168462172360246e-05, + "loss": 0.882, + "step": 1223 + }, + { + "epoch": 0.448515939904727, + "grad_norm": 1.5830448865890503, + "learning_rate": 1.1673231512331065e-05, + "loss": 0.8298, + "step": 1224 + }, + { + "epoch": 0.4488823744961524, + "grad_norm": 1.7107692956924438, + "learning_rate": 1.1661839067295515e-05, + "loss": 0.8391, + "step": 1225 + }, + { + "epoch": 0.44924880908757786, + "grad_norm": 1.7715849876403809, + "learning_rate": 1.1650444403704729e-05, + "loss": 0.8519, + "step": 1226 + }, + { + "epoch": 0.4496152436790033, + "grad_norm": 2.0211827754974365, + "learning_rate": 1.1639047536770581e-05, + "loss": 0.7753, + "step": 1227 + }, + { + "epoch": 0.44998167827042873, + "grad_norm": 1.7676541805267334, + "learning_rate": 1.162764848170789e-05, + "loss": 0.8719, + "step": 1228 + }, + { + "epoch": 0.45034811286185417, + "grad_norm": 1.7997002601623535, + "learning_rate": 1.1616247253734394e-05, + "loss": 0.8708, + "step": 1229 + }, + { + "epoch": 0.4507145474532796, + "grad_norm": 1.5801912546157837, + "learning_rate": 1.1604843868070737e-05, + "loss": 0.8994, + "step": 1230 + }, + { + "epoch": 0.45108098204470504, + "grad_norm": 1.8483695983886719, + "learning_rate": 1.1593438339940433e-05, + "loss": 0.8573, + "step": 1231 + }, + { + "epoch": 0.45144741663613047, + "grad_norm": 1.8525443077087402, + "learning_rate": 1.1582030684569867e-05, + "loss": 0.8523, + "step": 1232 + }, + { + "epoch": 0.4518138512275559, + "grad_norm": 2.0187089443206787, + "learning_rate": 1.1570620917188258e-05, + "loss": 0.782, + "step": 1233 + }, + { + "epoch": 0.4521802858189813, + "grad_norm": 1.7200089693069458, + "learning_rate": 1.1559209053027646e-05, + "loss": 0.8386, + "step": 1234 + }, + { + "epoch": 0.4525467204104067, + "grad_norm": 1.782767653465271, + "learning_rate": 1.1547795107322872e-05, + "loss": 0.8477, + "step": 1235 + }, + { + "epoch": 0.45291315500183216, + "grad_norm": 1.6392277479171753, + "learning_rate": 1.1536379095311552e-05, + "loss": 0.8898, + "step": 1236 + }, + { + "epoch": 0.4532795895932576, + "grad_norm": 1.6319552659988403, + "learning_rate": 1.1524961032234063e-05, + "loss": 0.8794, + "step": 1237 + }, + { + "epoch": 0.453646024184683, + "grad_norm": 1.7427445650100708, + "learning_rate": 1.1513540933333518e-05, + "loss": 0.8871, + "step": 1238 + }, + { + "epoch": 0.45401245877610846, + "grad_norm": 1.8319940567016602, + "learning_rate": 1.1502118813855756e-05, + "loss": 0.919, + "step": 1239 + }, + { + "epoch": 0.4543788933675339, + "grad_norm": 2.074718713760376, + "learning_rate": 1.14906946890493e-05, + "loss": 0.8275, + "step": 1240 + }, + { + "epoch": 0.45474532795895933, + "grad_norm": 1.9303303956985474, + "learning_rate": 1.147926857416536e-05, + "loss": 0.8594, + "step": 1241 + }, + { + "epoch": 0.45511176255038477, + "grad_norm": 1.7829499244689941, + "learning_rate": 1.1467840484457802e-05, + "loss": 0.8238, + "step": 1242 + }, + { + "epoch": 0.4554781971418102, + "grad_norm": 1.978939175605774, + "learning_rate": 1.1456410435183121e-05, + "loss": 0.8077, + "step": 1243 + }, + { + "epoch": 0.45584463173323564, + "grad_norm": 1.8652037382125854, + "learning_rate": 1.1444978441600437e-05, + "loss": 0.7969, + "step": 1244 + }, + { + "epoch": 0.4562110663246611, + "grad_norm": 1.90079927444458, + "learning_rate": 1.1433544518971463e-05, + "loss": 0.8939, + "step": 1245 + }, + { + "epoch": 0.45657750091608645, + "grad_norm": 1.7367563247680664, + "learning_rate": 1.1422108682560484e-05, + "loss": 0.9262, + "step": 1246 + }, + { + "epoch": 0.4569439355075119, + "grad_norm": 1.567033052444458, + "learning_rate": 1.1410670947634344e-05, + "loss": 0.8592, + "step": 1247 + }, + { + "epoch": 0.4573103700989373, + "grad_norm": 1.6424545049667358, + "learning_rate": 1.1399231329462417e-05, + "loss": 0.8162, + "step": 1248 + }, + { + "epoch": 0.45767680469036276, + "grad_norm": 1.5910917520523071, + "learning_rate": 1.1387789843316595e-05, + "loss": 0.5761, + "step": 1249 + }, + { + "epoch": 0.4580432392817882, + "grad_norm": 1.6359686851501465, + "learning_rate": 1.1376346504471265e-05, + "loss": 0.8552, + "step": 1250 + }, + { + "epoch": 0.45840967387321363, + "grad_norm": 1.6789294481277466, + "learning_rate": 1.1364901328203281e-05, + "loss": 0.8821, + "step": 1251 + }, + { + "epoch": 0.45877610846463907, + "grad_norm": 1.849265694618225, + "learning_rate": 1.135345432979196e-05, + "loss": 0.8076, + "step": 1252 + }, + { + "epoch": 0.4591425430560645, + "grad_norm": 1.6932215690612793, + "learning_rate": 1.1342005524519038e-05, + "loss": 0.8663, + "step": 1253 + }, + { + "epoch": 0.45950897764748994, + "grad_norm": 1.7442898750305176, + "learning_rate": 1.1330554927668673e-05, + "loss": 0.8856, + "step": 1254 + }, + { + "epoch": 0.45987541223891537, + "grad_norm": 1.8422902822494507, + "learning_rate": 1.1319102554527417e-05, + "loss": 0.8571, + "step": 1255 + }, + { + "epoch": 0.4602418468303408, + "grad_norm": 1.964569330215454, + "learning_rate": 1.1307648420384181e-05, + "loss": 0.8005, + "step": 1256 + }, + { + "epoch": 0.46060828142176624, + "grad_norm": 1.689928650856018, + "learning_rate": 1.129619254053024e-05, + "loss": 0.792, + "step": 1257 + }, + { + "epoch": 0.4609747160131916, + "grad_norm": 1.8332197666168213, + "learning_rate": 1.1284734930259196e-05, + "loss": 0.812, + "step": 1258 + }, + { + "epoch": 0.46134115060461706, + "grad_norm": 1.7506823539733887, + "learning_rate": 1.1273275604866954e-05, + "loss": 0.8223, + "step": 1259 + }, + { + "epoch": 0.4617075851960425, + "grad_norm": 2.092235803604126, + "learning_rate": 1.1261814579651715e-05, + "loss": 0.7873, + "step": 1260 + }, + { + "epoch": 0.4620740197874679, + "grad_norm": 1.8410356044769287, + "learning_rate": 1.1250351869913949e-05, + "loss": 0.8787, + "step": 1261 + }, + { + "epoch": 0.46244045437889336, + "grad_norm": 1.8117483854293823, + "learning_rate": 1.1238887490956374e-05, + "loss": 0.8607, + "step": 1262 + }, + { + "epoch": 0.4628068889703188, + "grad_norm": 2.216958522796631, + "learning_rate": 1.122742145808394e-05, + "loss": 0.7297, + "step": 1263 + }, + { + "epoch": 0.46317332356174423, + "grad_norm": 1.816941738128662, + "learning_rate": 1.1215953786603798e-05, + "loss": 0.8391, + "step": 1264 + }, + { + "epoch": 0.46353975815316967, + "grad_norm": 1.7820223569869995, + "learning_rate": 1.1204484491825289e-05, + "loss": 0.852, + "step": 1265 + }, + { + "epoch": 0.4639061927445951, + "grad_norm": 1.9520539045333862, + "learning_rate": 1.1193013589059922e-05, + "loss": 0.8399, + "step": 1266 + }, + { + "epoch": 0.46427262733602054, + "grad_norm": 1.8202781677246094, + "learning_rate": 1.1181541093621356e-05, + "loss": 0.8408, + "step": 1267 + }, + { + "epoch": 0.464639061927446, + "grad_norm": 1.7640472650527954, + "learning_rate": 1.117006702082537e-05, + "loss": 0.8051, + "step": 1268 + }, + { + "epoch": 0.4650054965188714, + "grad_norm": 1.6843233108520508, + "learning_rate": 1.1158591385989853e-05, + "loss": 0.8673, + "step": 1269 + }, + { + "epoch": 0.4653719311102968, + "grad_norm": 1.7685338258743286, + "learning_rate": 1.1147114204434782e-05, + "loss": 0.8184, + "step": 1270 + }, + { + "epoch": 0.4657383657017222, + "grad_norm": 2.0041980743408203, + "learning_rate": 1.1135635491482187e-05, + "loss": 0.7857, + "step": 1271 + }, + { + "epoch": 0.46610480029314766, + "grad_norm": 1.8624567985534668, + "learning_rate": 1.1124155262456153e-05, + "loss": 0.8569, + "step": 1272 + }, + { + "epoch": 0.4664712348845731, + "grad_norm": 1.9397152662277222, + "learning_rate": 1.1112673532682792e-05, + "loss": 0.9033, + "step": 1273 + }, + { + "epoch": 0.46683766947599853, + "grad_norm": 1.7223361730575562, + "learning_rate": 1.1101190317490204e-05, + "loss": 0.8823, + "step": 1274 + }, + { + "epoch": 0.46720410406742396, + "grad_norm": 1.7353929281234741, + "learning_rate": 1.1089705632208492e-05, + "loss": 0.8342, + "step": 1275 + }, + { + "epoch": 0.4675705386588494, + "grad_norm": 2.029233455657959, + "learning_rate": 1.1078219492169706e-05, + "loss": 0.8752, + "step": 1276 + }, + { + "epoch": 0.46793697325027483, + "grad_norm": 1.9964361190795898, + "learning_rate": 1.1066731912707842e-05, + "loss": 0.8093, + "step": 1277 + }, + { + "epoch": 0.46830340784170027, + "grad_norm": 1.8332338333129883, + "learning_rate": 1.1055242909158825e-05, + "loss": 0.8543, + "step": 1278 + }, + { + "epoch": 0.4686698424331257, + "grad_norm": 2.361818790435791, + "learning_rate": 1.104375249686047e-05, + "loss": 0.8291, + "step": 1279 + }, + { + "epoch": 0.46903627702455114, + "grad_norm": 1.7925989627838135, + "learning_rate": 1.1032260691152476e-05, + "loss": 0.9158, + "step": 1280 + }, + { + "epoch": 0.4694027116159766, + "grad_norm": 1.713956356048584, + "learning_rate": 1.1020767507376416e-05, + "loss": 0.8466, + "step": 1281 + }, + { + "epoch": 0.46976914620740196, + "grad_norm": 1.8565016984939575, + "learning_rate": 1.1009272960875676e-05, + "loss": 0.8416, + "step": 1282 + }, + { + "epoch": 0.4701355807988274, + "grad_norm": 1.8900073766708374, + "learning_rate": 1.0997777066995485e-05, + "loss": 0.9124, + "step": 1283 + }, + { + "epoch": 0.4705020153902528, + "grad_norm": 1.6251051425933838, + "learning_rate": 1.098627984108286e-05, + "loss": 0.8101, + "step": 1284 + }, + { + "epoch": 0.47086844998167826, + "grad_norm": 1.70287024974823, + "learning_rate": 1.0974781298486596e-05, + "loss": 0.8346, + "step": 1285 + }, + { + "epoch": 0.4712348845731037, + "grad_norm": 1.5804128646850586, + "learning_rate": 1.0963281454557253e-05, + "loss": 0.8686, + "step": 1286 + }, + { + "epoch": 0.47160131916452913, + "grad_norm": 1.6563670635223389, + "learning_rate": 1.0951780324647121e-05, + "loss": 0.8963, + "step": 1287 + }, + { + "epoch": 0.47196775375595457, + "grad_norm": 1.984560489654541, + "learning_rate": 1.0940277924110206e-05, + "loss": 0.8542, + "step": 1288 + }, + { + "epoch": 0.47233418834738, + "grad_norm": 1.7603366374969482, + "learning_rate": 1.0928774268302216e-05, + "loss": 0.7671, + "step": 1289 + }, + { + "epoch": 0.47270062293880544, + "grad_norm": 1.6885719299316406, + "learning_rate": 1.091726937258053e-05, + "loss": 0.9038, + "step": 1290 + }, + { + "epoch": 0.47306705753023087, + "grad_norm": 1.794811487197876, + "learning_rate": 1.090576325230419e-05, + "loss": 0.8681, + "step": 1291 + }, + { + "epoch": 0.4734334921216563, + "grad_norm": 1.6549447774887085, + "learning_rate": 1.0894255922833858e-05, + "loss": 0.8645, + "step": 1292 + }, + { + "epoch": 0.4737999267130817, + "grad_norm": 1.6765258312225342, + "learning_rate": 1.0882747399531828e-05, + "loss": 0.8511, + "step": 1293 + }, + { + "epoch": 0.4741663613045071, + "grad_norm": 1.7557064294815063, + "learning_rate": 1.0871237697761976e-05, + "loss": 0.8482, + "step": 1294 + }, + { + "epoch": 0.47453279589593256, + "grad_norm": 1.8007601499557495, + "learning_rate": 1.0859726832889752e-05, + "loss": 0.8592, + "step": 1295 + }, + { + "epoch": 0.474899230487358, + "grad_norm": 2.138437271118164, + "learning_rate": 1.0848214820282163e-05, + "loss": 0.8028, + "step": 1296 + }, + { + "epoch": 0.47526566507878343, + "grad_norm": 1.8807333707809448, + "learning_rate": 1.0836701675307748e-05, + "loss": 0.8282, + "step": 1297 + }, + { + "epoch": 0.47563209967020886, + "grad_norm": 1.9707701206207275, + "learning_rate": 1.0825187413336556e-05, + "loss": 0.8882, + "step": 1298 + }, + { + "epoch": 0.4759985342616343, + "grad_norm": 2.054344892501831, + "learning_rate": 1.0813672049740126e-05, + "loss": 0.8822, + "step": 1299 + }, + { + "epoch": 0.47636496885305973, + "grad_norm": 2.028402328491211, + "learning_rate": 1.0802155599891466e-05, + "loss": 0.8012, + "step": 1300 + }, + { + "epoch": 0.47673140344448517, + "grad_norm": 1.7414155006408691, + "learning_rate": 1.0790638079165041e-05, + "loss": 0.9486, + "step": 1301 + }, + { + "epoch": 0.4770978380359106, + "grad_norm": 1.7140246629714966, + "learning_rate": 1.0779119502936743e-05, + "loss": 0.7964, + "step": 1302 + }, + { + "epoch": 0.47746427262733604, + "grad_norm": 1.8553911447525024, + "learning_rate": 1.0767599886583866e-05, + "loss": 0.7817, + "step": 1303 + }, + { + "epoch": 0.4778307072187615, + "grad_norm": 1.6952173709869385, + "learning_rate": 1.075607924548511e-05, + "loss": 0.7962, + "step": 1304 + }, + { + "epoch": 0.47819714181018685, + "grad_norm": 1.79563570022583, + "learning_rate": 1.0744557595020518e-05, + "loss": 0.8404, + "step": 1305 + }, + { + "epoch": 0.4785635764016123, + "grad_norm": 1.712677240371704, + "learning_rate": 1.0733034950571497e-05, + "loss": 0.8422, + "step": 1306 + }, + { + "epoch": 0.4789300109930377, + "grad_norm": 1.8360368013381958, + "learning_rate": 1.0721511327520784e-05, + "loss": 0.8429, + "step": 1307 + }, + { + "epoch": 0.47929644558446316, + "grad_norm": 1.819366216659546, + "learning_rate": 1.0709986741252412e-05, + "loss": 0.826, + "step": 1308 + }, + { + "epoch": 0.4796628801758886, + "grad_norm": 1.797717809677124, + "learning_rate": 1.0698461207151707e-05, + "loss": 0.8412, + "step": 1309 + }, + { + "epoch": 0.48002931476731403, + "grad_norm": 1.9159932136535645, + "learning_rate": 1.0686934740605255e-05, + "loss": 0.9032, + "step": 1310 + }, + { + "epoch": 0.48039574935873947, + "grad_norm": 1.6352150440216064, + "learning_rate": 1.067540735700089e-05, + "loss": 0.8857, + "step": 1311 + }, + { + "epoch": 0.4807621839501649, + "grad_norm": 1.794416904449463, + "learning_rate": 1.0663879071727667e-05, + "loss": 0.8368, + "step": 1312 + }, + { + "epoch": 0.48112861854159034, + "grad_norm": 1.9773752689361572, + "learning_rate": 1.065234990017585e-05, + "loss": 0.951, + "step": 1313 + }, + { + "epoch": 0.48149505313301577, + "grad_norm": 1.5759273767471313, + "learning_rate": 1.0640819857736888e-05, + "loss": 0.8831, + "step": 1314 + }, + { + "epoch": 0.4818614877244412, + "grad_norm": 2.310464859008789, + "learning_rate": 1.062928895980338e-05, + "loss": 0.7709, + "step": 1315 + }, + { + "epoch": 0.48222792231586664, + "grad_norm": 1.967894196510315, + "learning_rate": 1.0617757221769085e-05, + "loss": 0.8223, + "step": 1316 + }, + { + "epoch": 0.482594356907292, + "grad_norm": 1.784820318222046, + "learning_rate": 1.0606224659028866e-05, + "loss": 0.8454, + "step": 1317 + }, + { + "epoch": 0.48296079149871746, + "grad_norm": 2.418149709701538, + "learning_rate": 1.0594691286978694e-05, + "loss": 0.8274, + "step": 1318 + }, + { + "epoch": 0.4833272260901429, + "grad_norm": 2.046618938446045, + "learning_rate": 1.0583157121015624e-05, + "loss": 0.7996, + "step": 1319 + }, + { + "epoch": 0.4836936606815683, + "grad_norm": 1.8235247135162354, + "learning_rate": 1.057162217653777e-05, + "loss": 0.8943, + "step": 1320 + }, + { + "epoch": 0.48406009527299376, + "grad_norm": 1.7685117721557617, + "learning_rate": 1.0560086468944285e-05, + "loss": 0.8375, + "step": 1321 + }, + { + "epoch": 0.4844265298644192, + "grad_norm": 1.8292624950408936, + "learning_rate": 1.0548550013635332e-05, + "loss": 0.8761, + "step": 1322 + }, + { + "epoch": 0.48479296445584463, + "grad_norm": 1.636870265007019, + "learning_rate": 1.0537012826012085e-05, + "loss": 0.8491, + "step": 1323 + }, + { + "epoch": 0.48515939904727007, + "grad_norm": 1.6674764156341553, + "learning_rate": 1.0525474921476686e-05, + "loss": 0.8977, + "step": 1324 + }, + { + "epoch": 0.4855258336386955, + "grad_norm": 1.8313924074172974, + "learning_rate": 1.0513936315432245e-05, + "loss": 0.8016, + "step": 1325 + }, + { + "epoch": 0.48589226823012094, + "grad_norm": 2.065986156463623, + "learning_rate": 1.0502397023282792e-05, + "loss": 0.8052, + "step": 1326 + }, + { + "epoch": 0.4862587028215464, + "grad_norm": 1.9295268058776855, + "learning_rate": 1.0490857060433292e-05, + "loss": 0.9053, + "step": 1327 + }, + { + "epoch": 0.4866251374129718, + "grad_norm": 2.224064588546753, + "learning_rate": 1.0479316442289594e-05, + "loss": 0.8122, + "step": 1328 + }, + { + "epoch": 0.4869915720043972, + "grad_norm": 1.6870437860488892, + "learning_rate": 1.0467775184258418e-05, + "loss": 0.8663, + "step": 1329 + }, + { + "epoch": 0.4873580065958226, + "grad_norm": 1.8607361316680908, + "learning_rate": 1.0456233301747348e-05, + "loss": 0.8146, + "step": 1330 + }, + { + "epoch": 0.48772444118724806, + "grad_norm": 1.9341589212417603, + "learning_rate": 1.0444690810164798e-05, + "loss": 0.8418, + "step": 1331 + }, + { + "epoch": 0.4880908757786735, + "grad_norm": 1.6947089433670044, + "learning_rate": 1.0433147724919994e-05, + "loss": 0.851, + "step": 1332 + }, + { + "epoch": 0.48845731037009893, + "grad_norm": 1.8106732368469238, + "learning_rate": 1.0421604061422956e-05, + "loss": 0.8499, + "step": 1333 + }, + { + "epoch": 0.48882374496152436, + "grad_norm": 2.0022175312042236, + "learning_rate": 1.0410059835084473e-05, + "loss": 0.8259, + "step": 1334 + }, + { + "epoch": 0.4891901795529498, + "grad_norm": 1.6601237058639526, + "learning_rate": 1.0398515061316095e-05, + "loss": 0.8095, + "step": 1335 + }, + { + "epoch": 0.48955661414437524, + "grad_norm": 1.8434045314788818, + "learning_rate": 1.0386969755530085e-05, + "loss": 0.8839, + "step": 1336 + }, + { + "epoch": 0.48992304873580067, + "grad_norm": 2.060389518737793, + "learning_rate": 1.0375423933139432e-05, + "loss": 0.8123, + "step": 1337 + }, + { + "epoch": 0.4902894833272261, + "grad_norm": 1.8199058771133423, + "learning_rate": 1.0363877609557812e-05, + "loss": 0.8199, + "step": 1338 + }, + { + "epoch": 0.49065591791865154, + "grad_norm": 2.1421399116516113, + "learning_rate": 1.0352330800199567e-05, + "loss": 0.7497, + "step": 1339 + }, + { + "epoch": 0.491022352510077, + "grad_norm": 2.0703132152557373, + "learning_rate": 1.0340783520479685e-05, + "loss": 0.8741, + "step": 1340 + }, + { + "epoch": 0.49138878710150236, + "grad_norm": 1.9738215208053589, + "learning_rate": 1.0329235785813783e-05, + "loss": 0.89, + "step": 1341 + }, + { + "epoch": 0.4917552216929278, + "grad_norm": 1.8550047874450684, + "learning_rate": 1.031768761161809e-05, + "loss": 0.8284, + "step": 1342 + }, + { + "epoch": 0.4921216562843532, + "grad_norm": 1.751159429550171, + "learning_rate": 1.0306139013309425e-05, + "loss": 0.8669, + "step": 1343 + }, + { + "epoch": 0.49248809087577866, + "grad_norm": 1.841949224472046, + "learning_rate": 1.0294590006305158e-05, + "loss": 0.8763, + "step": 1344 + }, + { + "epoch": 0.4928545254672041, + "grad_norm": 1.8933924436569214, + "learning_rate": 1.0283040606023223e-05, + "loss": 0.868, + "step": 1345 + }, + { + "epoch": 0.49322096005862953, + "grad_norm": 1.6940875053405762, + "learning_rate": 1.0271490827882058e-05, + "loss": 0.9048, + "step": 1346 + }, + { + "epoch": 0.49358739465005497, + "grad_norm": 1.802107810974121, + "learning_rate": 1.0259940687300627e-05, + "loss": 0.862, + "step": 1347 + }, + { + "epoch": 0.4939538292414804, + "grad_norm": 1.8945834636688232, + "learning_rate": 1.0248390199698366e-05, + "loss": 0.8206, + "step": 1348 + }, + { + "epoch": 0.49432026383290584, + "grad_norm": 1.7853095531463623, + "learning_rate": 1.0236839380495174e-05, + "loss": 0.8967, + "step": 1349 + }, + { + "epoch": 0.4946866984243313, + "grad_norm": 2.1143691539764404, + "learning_rate": 1.02252882451114e-05, + "loss": 0.839, + "step": 1350 + }, + { + "epoch": 0.4950531330157567, + "grad_norm": 2.1969010829925537, + "learning_rate": 1.0213736808967809e-05, + "loss": 0.7853, + "step": 1351 + }, + { + "epoch": 0.49541956760718214, + "grad_norm": 1.987229347229004, + "learning_rate": 1.0202185087485563e-05, + "loss": 0.8451, + "step": 1352 + }, + { + "epoch": 0.4957860021986075, + "grad_norm": 1.8627501726150513, + "learning_rate": 1.0190633096086217e-05, + "loss": 0.8877, + "step": 1353 + }, + { + "epoch": 0.49615243679003296, + "grad_norm": 1.9795594215393066, + "learning_rate": 1.0179080850191678e-05, + "loss": 0.8398, + "step": 1354 + }, + { + "epoch": 0.4965188713814584, + "grad_norm": 2.168057680130005, + "learning_rate": 1.0167528365224197e-05, + "loss": 0.8705, + "step": 1355 + }, + { + "epoch": 0.49688530597288383, + "grad_norm": 2.0126590728759766, + "learning_rate": 1.0155975656606338e-05, + "loss": 0.8346, + "step": 1356 + }, + { + "epoch": 0.49725174056430926, + "grad_norm": 1.975282073020935, + "learning_rate": 1.014442273976097e-05, + "loss": 0.863, + "step": 1357 + }, + { + "epoch": 0.4976181751557347, + "grad_norm": 2.144953489303589, + "learning_rate": 1.013286963011124e-05, + "loss": 0.9066, + "step": 1358 + }, + { + "epoch": 0.49798460974716013, + "grad_norm": 1.99014151096344, + "learning_rate": 1.0121316343080542e-05, + "loss": 0.822, + "step": 1359 + }, + { + "epoch": 0.49835104433858557, + "grad_norm": 1.8074718713760376, + "learning_rate": 1.0109762894092521e-05, + "loss": 0.856, + "step": 1360 + }, + { + "epoch": 0.498717478930011, + "grad_norm": 1.2285923957824707, + "learning_rate": 1.0098209298571034e-05, + "loss": 0.5514, + "step": 1361 + }, + { + "epoch": 0.49908391352143644, + "grad_norm": 2.1667258739471436, + "learning_rate": 1.0086655571940127e-05, + "loss": 0.827, + "step": 1362 + }, + { + "epoch": 0.4994503481128619, + "grad_norm": 1.9957808256149292, + "learning_rate": 1.0075101729624025e-05, + "loss": 0.7647, + "step": 1363 + }, + { + "epoch": 0.4998167827042873, + "grad_norm": 1.786232352256775, + "learning_rate": 1.0063547787047114e-05, + "loss": 0.9069, + "step": 1364 + }, + { + "epoch": 0.5001832172957127, + "grad_norm": 1.7317724227905273, + "learning_rate": 1.0051993759633903e-05, + "loss": 0.8322, + "step": 1365 + }, + { + "epoch": 0.5005496518871382, + "grad_norm": 2.1333634853363037, + "learning_rate": 1.004043966280902e-05, + "loss": 0.8836, + "step": 1366 + }, + { + "epoch": 0.5005496518871382, + "eval_loss": 0.8037351369857788, + "eval_runtime": 799.1015, + "eval_samples_per_second": 3.394, + "eval_steps_per_second": 0.424, + "step": 1366 + }, + { + "epoch": 0.5009160864785636, + "grad_norm": 1.9786326885223389, + "learning_rate": 1.0028885511997187e-05, + "loss": 0.8427, + "step": 1367 + }, + { + "epoch": 0.501282521069989, + "grad_norm": 1.7938628196716309, + "learning_rate": 1.0017331322623189e-05, + "loss": 0.8882, + "step": 1368 + }, + { + "epoch": 0.5016489556614144, + "grad_norm": 1.6222481727600098, + "learning_rate": 1.0005777110111878e-05, + "loss": 0.9011, + "step": 1369 + }, + { + "epoch": 0.5020153902528399, + "grad_norm": 1.868241786956787, + "learning_rate": 9.994222889888124e-06, + "loss": 0.8445, + "step": 1370 + }, + { + "epoch": 0.5023818248442653, + "grad_norm": 2.0541722774505615, + "learning_rate": 9.982668677376811e-06, + "loss": 0.8564, + "step": 1371 + }, + { + "epoch": 0.5027482594356907, + "grad_norm": 1.7385966777801514, + "learning_rate": 9.971114488002817e-06, + "loss": 0.8452, + "step": 1372 + }, + { + "epoch": 0.5031146940271162, + "grad_norm": 1.80974280834198, + "learning_rate": 9.959560337190983e-06, + "loss": 0.8711, + "step": 1373 + }, + { + "epoch": 0.5034811286185416, + "grad_norm": 1.864256501197815, + "learning_rate": 9.9480062403661e-06, + "loss": 0.8603, + "step": 1374 + }, + { + "epoch": 0.503847563209967, + "grad_norm": 1.8246670961380005, + "learning_rate": 9.936452212952888e-06, + "loss": 0.8189, + "step": 1375 + }, + { + "epoch": 0.5042139978013924, + "grad_norm": 1.888363003730774, + "learning_rate": 9.924898270375975e-06, + "loss": 0.8572, + "step": 1376 + }, + { + "epoch": 0.5045804323928179, + "grad_norm": 2.028559446334839, + "learning_rate": 9.913344428059876e-06, + "loss": 0.807, + "step": 1377 + }, + { + "epoch": 0.5049468669842433, + "grad_norm": 1.936490535736084, + "learning_rate": 9.90179070142897e-06, + "loss": 0.8288, + "step": 1378 + }, + { + "epoch": 0.5053133015756688, + "grad_norm": 1.8482036590576172, + "learning_rate": 9.890237105907482e-06, + "loss": 0.8411, + "step": 1379 + }, + { + "epoch": 0.5056797361670942, + "grad_norm": 2.113351345062256, + "learning_rate": 9.878683656919461e-06, + "loss": 0.8251, + "step": 1380 + }, + { + "epoch": 0.5060461707585197, + "grad_norm": 2.063748836517334, + "learning_rate": 9.867130369888764e-06, + "loss": 0.8169, + "step": 1381 + }, + { + "epoch": 0.506412605349945, + "grad_norm": 1.8193755149841309, + "learning_rate": 9.855577260239031e-06, + "loss": 0.9078, + "step": 1382 + }, + { + "epoch": 0.5067790399413704, + "grad_norm": 1.823914647102356, + "learning_rate": 9.844024343393665e-06, + "loss": 0.7845, + "step": 1383 + }, + { + "epoch": 0.5071454745327959, + "grad_norm": 1.9707849025726318, + "learning_rate": 9.832471634775806e-06, + "loss": 0.7809, + "step": 1384 + }, + { + "epoch": 0.5075119091242213, + "grad_norm": 1.912359595298767, + "learning_rate": 9.820919149808324e-06, + "loss": 0.8774, + "step": 1385 + }, + { + "epoch": 0.5078783437156468, + "grad_norm": 1.7348723411560059, + "learning_rate": 9.809366903913785e-06, + "loss": 0.8599, + "step": 1386 + }, + { + "epoch": 0.5082447783070722, + "grad_norm": 1.7531144618988037, + "learning_rate": 9.797814912514436e-06, + "loss": 0.8436, + "step": 1387 + }, + { + "epoch": 0.5086112128984976, + "grad_norm": 1.9358574151992798, + "learning_rate": 9.786263191032196e-06, + "loss": 0.8119, + "step": 1388 + }, + { + "epoch": 0.508977647489923, + "grad_norm": 2.0156936645507812, + "learning_rate": 9.774711754888602e-06, + "loss": 0.8819, + "step": 1389 + }, + { + "epoch": 0.5093440820813485, + "grad_norm": 1.9308418035507202, + "learning_rate": 9.763160619504828e-06, + "loss": 0.9012, + "step": 1390 + }, + { + "epoch": 0.5097105166727739, + "grad_norm": 1.8228894472122192, + "learning_rate": 9.751609800301638e-06, + "loss": 0.8609, + "step": 1391 + }, + { + "epoch": 0.5100769512641994, + "grad_norm": 1.7600071430206299, + "learning_rate": 9.740059312699375e-06, + "loss": 0.9132, + "step": 1392 + }, + { + "epoch": 0.5104433858556248, + "grad_norm": 1.6532927751541138, + "learning_rate": 9.728509172117947e-06, + "loss": 0.8989, + "step": 1393 + }, + { + "epoch": 0.5108098204470503, + "grad_norm": 1.9439219236373901, + "learning_rate": 9.716959393976784e-06, + "loss": 0.8466, + "step": 1394 + }, + { + "epoch": 0.5111762550384756, + "grad_norm": 1.8679782152175903, + "learning_rate": 9.705409993694844e-06, + "loss": 0.8169, + "step": 1395 + }, + { + "epoch": 0.511542689629901, + "grad_norm": 2.1819117069244385, + "learning_rate": 9.693860986690579e-06, + "loss": 0.7761, + "step": 1396 + }, + { + "epoch": 0.5119091242213265, + "grad_norm": 2.5734708309173584, + "learning_rate": 9.68231238838191e-06, + "loss": 0.7716, + "step": 1397 + }, + { + "epoch": 0.5122755588127519, + "grad_norm": 1.8313605785369873, + "learning_rate": 9.670764214186222e-06, + "loss": 0.8782, + "step": 1398 + }, + { + "epoch": 0.5126419934041774, + "grad_norm": 2.187032461166382, + "learning_rate": 9.659216479520321e-06, + "loss": 0.7673, + "step": 1399 + }, + { + "epoch": 0.5130084279956028, + "grad_norm": 2.0457074642181396, + "learning_rate": 9.647669199800437e-06, + "loss": 0.8126, + "step": 1400 + }, + { + "epoch": 0.5133748625870282, + "grad_norm": 1.690920114517212, + "learning_rate": 9.636122390442191e-06, + "loss": 0.8425, + "step": 1401 + }, + { + "epoch": 0.5137412971784536, + "grad_norm": 1.909960150718689, + "learning_rate": 9.62457606686057e-06, + "loss": 0.875, + "step": 1402 + }, + { + "epoch": 0.5141077317698791, + "grad_norm": 2.1119649410247803, + "learning_rate": 9.613030244469917e-06, + "loss": 0.8522, + "step": 1403 + }, + { + "epoch": 0.5144741663613045, + "grad_norm": 2.170766830444336, + "learning_rate": 9.601484938683912e-06, + "loss": 0.7812, + "step": 1404 + }, + { + "epoch": 0.51484060095273, + "grad_norm": 1.9513856172561646, + "learning_rate": 9.589940164915528e-06, + "loss": 0.8697, + "step": 1405 + }, + { + "epoch": 0.5152070355441554, + "grad_norm": 1.9524122476577759, + "learning_rate": 9.578395938577047e-06, + "loss": 0.8508, + "step": 1406 + }, + { + "epoch": 0.5155734701355807, + "grad_norm": 2.066345691680908, + "learning_rate": 9.566852275080008e-06, + "loss": 0.8639, + "step": 1407 + }, + { + "epoch": 0.5159399047270062, + "grad_norm": 1.8953510522842407, + "learning_rate": 9.555309189835204e-06, + "loss": 0.8548, + "step": 1408 + }, + { + "epoch": 0.5163063393184316, + "grad_norm": 1.7897011041641235, + "learning_rate": 9.543766698252657e-06, + "loss": 0.8487, + "step": 1409 + }, + { + "epoch": 0.5166727739098571, + "grad_norm": 1.8420007228851318, + "learning_rate": 9.532224815741586e-06, + "loss": 0.8521, + "step": 1410 + }, + { + "epoch": 0.5170392085012825, + "grad_norm": 2.073654890060425, + "learning_rate": 9.52068355771041e-06, + "loss": 0.7481, + "step": 1411 + }, + { + "epoch": 0.517405643092708, + "grad_norm": 1.80082106590271, + "learning_rate": 9.50914293956671e-06, + "loss": 0.8771, + "step": 1412 + }, + { + "epoch": 0.5177720776841334, + "grad_norm": 1.9965510368347168, + "learning_rate": 9.497602976717207e-06, + "loss": 0.8244, + "step": 1413 + }, + { + "epoch": 0.5181385122755588, + "grad_norm": 1.875783920288086, + "learning_rate": 9.486063684567762e-06, + "loss": 0.811, + "step": 1414 + }, + { + "epoch": 0.5185049468669842, + "grad_norm": 1.7989037036895752, + "learning_rate": 9.474525078523317e-06, + "loss": 0.8373, + "step": 1415 + }, + { + "epoch": 0.5188713814584097, + "grad_norm": 1.6763300895690918, + "learning_rate": 9.462987173987918e-06, + "loss": 0.8654, + "step": 1416 + }, + { + "epoch": 0.5192378160498351, + "grad_norm": 1.7529571056365967, + "learning_rate": 9.451449986364671e-06, + "loss": 0.8432, + "step": 1417 + }, + { + "epoch": 0.5196042506412606, + "grad_norm": 2.070521116256714, + "learning_rate": 9.439913531055719e-06, + "loss": 0.8319, + "step": 1418 + }, + { + "epoch": 0.519970685232686, + "grad_norm": 2.0340991020202637, + "learning_rate": 9.428377823462233e-06, + "loss": 0.8814, + "step": 1419 + }, + { + "epoch": 0.5203371198241113, + "grad_norm": 1.8266870975494385, + "learning_rate": 9.41684287898438e-06, + "loss": 0.8147, + "step": 1420 + }, + { + "epoch": 0.5207035544155368, + "grad_norm": 1.816367506980896, + "learning_rate": 9.40530871302131e-06, + "loss": 0.8683, + "step": 1421 + }, + { + "epoch": 0.5210699890069622, + "grad_norm": 1.8408104181289673, + "learning_rate": 9.393775340971139e-06, + "loss": 0.7732, + "step": 1422 + }, + { + "epoch": 0.5214364235983877, + "grad_norm": 1.931918978691101, + "learning_rate": 9.382242778230917e-06, + "loss": 0.8475, + "step": 1423 + }, + { + "epoch": 0.5218028581898131, + "grad_norm": 2.0313661098480225, + "learning_rate": 9.370711040196621e-06, + "loss": 0.8688, + "step": 1424 + }, + { + "epoch": 0.5221692927812386, + "grad_norm": 1.982347846031189, + "learning_rate": 9.359180142263116e-06, + "loss": 0.8364, + "step": 1425 + }, + { + "epoch": 0.522535727372664, + "grad_norm": 1.9787847995758057, + "learning_rate": 9.347650099824151e-06, + "loss": 0.9508, + "step": 1426 + }, + { + "epoch": 0.5229021619640895, + "grad_norm": 1.7552639245986938, + "learning_rate": 9.336120928272335e-06, + "loss": 0.842, + "step": 1427 + }, + { + "epoch": 0.5232685965555148, + "grad_norm": 1.8307520151138306, + "learning_rate": 9.324592642999112e-06, + "loss": 0.8447, + "step": 1428 + }, + { + "epoch": 0.5236350311469403, + "grad_norm": 2.5280072689056396, + "learning_rate": 9.31306525939475e-06, + "loss": 0.7423, + "step": 1429 + }, + { + "epoch": 0.5240014657383657, + "grad_norm": 2.190800189971924, + "learning_rate": 9.301538792848297e-06, + "loss": 0.8356, + "step": 1430 + }, + { + "epoch": 0.5243679003297911, + "grad_norm": 1.8437484502792358, + "learning_rate": 9.290013258747591e-06, + "loss": 0.8997, + "step": 1431 + }, + { + "epoch": 0.5247343349212166, + "grad_norm": 1.7690744400024414, + "learning_rate": 9.278488672479218e-06, + "loss": 0.8618, + "step": 1432 + }, + { + "epoch": 0.525100769512642, + "grad_norm": 2.0354177951812744, + "learning_rate": 9.266965049428503e-06, + "loss": 0.8644, + "step": 1433 + }, + { + "epoch": 0.5254672041040674, + "grad_norm": 1.8663829565048218, + "learning_rate": 9.255442404979484e-06, + "loss": 0.8965, + "step": 1434 + }, + { + "epoch": 0.5258336386954928, + "grad_norm": 1.9830769300460815, + "learning_rate": 9.243920754514895e-06, + "loss": 0.8205, + "step": 1435 + }, + { + "epoch": 0.5262000732869183, + "grad_norm": 1.9793612957000732, + "learning_rate": 9.232400113416136e-06, + "loss": 0.776, + "step": 1436 + }, + { + "epoch": 0.5265665078783437, + "grad_norm": 1.7279483079910278, + "learning_rate": 9.22088049706326e-06, + "loss": 0.8636, + "step": 1437 + }, + { + "epoch": 0.5269329424697692, + "grad_norm": 1.8364406824111938, + "learning_rate": 9.20936192083496e-06, + "loss": 0.8017, + "step": 1438 + }, + { + "epoch": 0.5272993770611946, + "grad_norm": 1.7913066148757935, + "learning_rate": 9.197844400108536e-06, + "loss": 0.8637, + "step": 1439 + }, + { + "epoch": 0.52766581165262, + "grad_norm": 1.9976123571395874, + "learning_rate": 9.18632795025988e-06, + "loss": 0.9093, + "step": 1440 + }, + { + "epoch": 0.5280322462440454, + "grad_norm": 1.79649817943573, + "learning_rate": 9.174812586663447e-06, + "loss": 0.8709, + "step": 1441 + }, + { + "epoch": 0.5283986808354708, + "grad_norm": 1.9000849723815918, + "learning_rate": 9.163298324692254e-06, + "loss": 0.8728, + "step": 1442 + }, + { + "epoch": 0.5287651154268963, + "grad_norm": 2.158200740814209, + "learning_rate": 9.151785179717838e-06, + "loss": 0.9365, + "step": 1443 + }, + { + "epoch": 0.5291315500183217, + "grad_norm": 2.1834192276000977, + "learning_rate": 9.14027316711025e-06, + "loss": 0.8341, + "step": 1444 + }, + { + "epoch": 0.5294979846097472, + "grad_norm": 2.1154308319091797, + "learning_rate": 9.12876230223803e-06, + "loss": 0.8308, + "step": 1445 + }, + { + "epoch": 0.5298644192011726, + "grad_norm": 2.185143232345581, + "learning_rate": 9.117252600468175e-06, + "loss": 0.8251, + "step": 1446 + }, + { + "epoch": 0.530230853792598, + "grad_norm": 2.063204526901245, + "learning_rate": 9.105744077166145e-06, + "loss": 0.8575, + "step": 1447 + }, + { + "epoch": 0.5305972883840234, + "grad_norm": 1.9822335243225098, + "learning_rate": 9.094236747695814e-06, + "loss": 0.8331, + "step": 1448 + }, + { + "epoch": 0.5309637229754489, + "grad_norm": 2.593322992324829, + "learning_rate": 9.08273062741947e-06, + "loss": 0.7768, + "step": 1449 + }, + { + "epoch": 0.5313301575668743, + "grad_norm": 2.102851390838623, + "learning_rate": 9.071225731697789e-06, + "loss": 0.7953, + "step": 1450 + }, + { + "epoch": 0.5316965921582998, + "grad_norm": 1.9108489751815796, + "learning_rate": 9.059722075889798e-06, + "loss": 0.7976, + "step": 1451 + }, + { + "epoch": 0.5320630267497252, + "grad_norm": 2.1853110790252686, + "learning_rate": 9.048219675352882e-06, + "loss": 0.8989, + "step": 1452 + }, + { + "epoch": 0.5324294613411507, + "grad_norm": 2.1810035705566406, + "learning_rate": 9.036718545442748e-06, + "loss": 0.7938, + "step": 1453 + }, + { + "epoch": 0.532795895932576, + "grad_norm": 1.8937190771102905, + "learning_rate": 9.025218701513404e-06, + "loss": 0.83, + "step": 1454 + }, + { + "epoch": 0.5331623305240014, + "grad_norm": 1.77532160282135, + "learning_rate": 9.013720158917145e-06, + "loss": 0.8331, + "step": 1455 + }, + { + "epoch": 0.5335287651154269, + "grad_norm": 2.0107715129852295, + "learning_rate": 9.002222933004518e-06, + "loss": 0.7664, + "step": 1456 + }, + { + "epoch": 0.5338951997068523, + "grad_norm": 1.7541794776916504, + "learning_rate": 8.990727039124327e-06, + "loss": 0.8028, + "step": 1457 + }, + { + "epoch": 0.5342616342982778, + "grad_norm": 1.9145309925079346, + "learning_rate": 8.979232492623587e-06, + "loss": 0.8315, + "step": 1458 + }, + { + "epoch": 0.5346280688897032, + "grad_norm": 1.8287135362625122, + "learning_rate": 8.967739308847524e-06, + "loss": 0.8707, + "step": 1459 + }, + { + "epoch": 0.5349945034811286, + "grad_norm": 1.9218909740447998, + "learning_rate": 8.956247503139537e-06, + "loss": 0.8504, + "step": 1460 + }, + { + "epoch": 0.535360938072554, + "grad_norm": 2.1773972511291504, + "learning_rate": 8.94475709084118e-06, + "loss": 0.8332, + "step": 1461 + }, + { + "epoch": 0.5357273726639795, + "grad_norm": 2.0021514892578125, + "learning_rate": 8.93326808729216e-06, + "loss": 0.8127, + "step": 1462 + }, + { + "epoch": 0.5360938072554049, + "grad_norm": 1.9204373359680176, + "learning_rate": 8.921780507830296e-06, + "loss": 0.8519, + "step": 1463 + }, + { + "epoch": 0.5364602418468304, + "grad_norm": 2.199204206466675, + "learning_rate": 8.91029436779151e-06, + "loss": 0.7477, + "step": 1464 + }, + { + "epoch": 0.5368266764382558, + "grad_norm": 1.8903037309646606, + "learning_rate": 8.898809682509796e-06, + "loss": 0.8401, + "step": 1465 + }, + { + "epoch": 0.5371931110296811, + "grad_norm": 2.7648026943206787, + "learning_rate": 8.887326467317215e-06, + "loss": 0.8437, + "step": 1466 + }, + { + "epoch": 0.5375595456211066, + "grad_norm": 2.081468105316162, + "learning_rate": 8.87584473754385e-06, + "loss": 0.8562, + "step": 1467 + }, + { + "epoch": 0.537925980212532, + "grad_norm": 2.111710786819458, + "learning_rate": 8.864364508517816e-06, + "loss": 0.8454, + "step": 1468 + }, + { + "epoch": 0.5382924148039575, + "grad_norm": 2.428865432739258, + "learning_rate": 8.852885795565221e-06, + "loss": 0.752, + "step": 1469 + }, + { + "epoch": 0.5386588493953829, + "grad_norm": 2.243191719055176, + "learning_rate": 8.841408614010148e-06, + "loss": 0.8432, + "step": 1470 + }, + { + "epoch": 0.5390252839868084, + "grad_norm": 1.8198479413986206, + "learning_rate": 8.829932979174635e-06, + "loss": 0.8663, + "step": 1471 + }, + { + "epoch": 0.5393917185782338, + "grad_norm": 1.930250883102417, + "learning_rate": 8.81845890637865e-06, + "loss": 0.8288, + "step": 1472 + }, + { + "epoch": 0.5397581531696592, + "grad_norm": 1.8728525638580322, + "learning_rate": 8.80698641094008e-06, + "loss": 0.8411, + "step": 1473 + }, + { + "epoch": 0.5401245877610846, + "grad_norm": 2.136594533920288, + "learning_rate": 8.795515508174713e-06, + "loss": 0.8127, + "step": 1474 + }, + { + "epoch": 0.5404910223525101, + "grad_norm": 1.3855836391448975, + "learning_rate": 8.784046213396206e-06, + "loss": 0.5478, + "step": 1475 + }, + { + "epoch": 0.5408574569439355, + "grad_norm": 1.8245359659194946, + "learning_rate": 8.772578541916063e-06, + "loss": 0.8099, + "step": 1476 + }, + { + "epoch": 0.541223891535361, + "grad_norm": 1.8170462846755981, + "learning_rate": 8.76111250904363e-06, + "loss": 0.8622, + "step": 1477 + }, + { + "epoch": 0.5415903261267864, + "grad_norm": 1.8183109760284424, + "learning_rate": 8.749648130086054e-06, + "loss": 0.8705, + "step": 1478 + }, + { + "epoch": 0.5419567607182117, + "grad_norm": 1.731231689453125, + "learning_rate": 8.738185420348287e-06, + "loss": 0.893, + "step": 1479 + }, + { + "epoch": 0.5423231953096372, + "grad_norm": 1.8436676263809204, + "learning_rate": 8.726724395133048e-06, + "loss": 0.8448, + "step": 1480 + }, + { + "epoch": 0.5426896299010626, + "grad_norm": 2.3991260528564453, + "learning_rate": 8.715265069740809e-06, + "loss": 0.7758, + "step": 1481 + }, + { + "epoch": 0.5430560644924881, + "grad_norm": 1.7913585901260376, + "learning_rate": 8.703807459469763e-06, + "loss": 0.821, + "step": 1482 + }, + { + "epoch": 0.5434224990839135, + "grad_norm": 1.7945986986160278, + "learning_rate": 8.692351579615822e-06, + "loss": 0.8389, + "step": 1483 + }, + { + "epoch": 0.543788933675339, + "grad_norm": 1.5730236768722534, + "learning_rate": 8.680897445472588e-06, + "loss": 0.5625, + "step": 1484 + }, + { + "epoch": 0.5441553682667644, + "grad_norm": 1.9446221590042114, + "learning_rate": 8.669445072331327e-06, + "loss": 0.8495, + "step": 1485 + }, + { + "epoch": 0.5445218028581899, + "grad_norm": 1.8746920824050903, + "learning_rate": 8.657994475480967e-06, + "loss": 0.8506, + "step": 1486 + }, + { + "epoch": 0.5448882374496152, + "grad_norm": 2.2092230319976807, + "learning_rate": 8.646545670208044e-06, + "loss": 0.7538, + "step": 1487 + }, + { + "epoch": 0.5452546720410407, + "grad_norm": 1.8926215171813965, + "learning_rate": 8.63509867179672e-06, + "loss": 0.8054, + "step": 1488 + }, + { + "epoch": 0.5456211066324661, + "grad_norm": 1.9946545362472534, + "learning_rate": 8.623653495528738e-06, + "loss": 0.8634, + "step": 1489 + }, + { + "epoch": 0.5459875412238915, + "grad_norm": 2.426816463470459, + "learning_rate": 8.612210156683407e-06, + "loss": 0.7489, + "step": 1490 + }, + { + "epoch": 0.546353975815317, + "grad_norm": 1.9078896045684814, + "learning_rate": 8.600768670537588e-06, + "loss": 0.835, + "step": 1491 + }, + { + "epoch": 0.5467204104067424, + "grad_norm": 2.1838278770446777, + "learning_rate": 8.58932905236566e-06, + "loss": 0.8453, + "step": 1492 + }, + { + "epoch": 0.5470868449981678, + "grad_norm": 2.0195322036743164, + "learning_rate": 8.577891317439519e-06, + "loss": 0.8305, + "step": 1493 + }, + { + "epoch": 0.5474532795895932, + "grad_norm": 2.025602102279663, + "learning_rate": 8.56645548102854e-06, + "loss": 0.9082, + "step": 1494 + }, + { + "epoch": 0.5478197141810187, + "grad_norm": 2.021904706954956, + "learning_rate": 8.555021558399565e-06, + "loss": 0.8613, + "step": 1495 + }, + { + "epoch": 0.5481861487724441, + "grad_norm": 2.3774404525756836, + "learning_rate": 8.54358956481688e-06, + "loss": 0.7845, + "step": 1496 + }, + { + "epoch": 0.5485525833638696, + "grad_norm": 1.6819612979888916, + "learning_rate": 8.532159515542204e-06, + "loss": 0.8947, + "step": 1497 + }, + { + "epoch": 0.548919017955295, + "grad_norm": 2.2661547660827637, + "learning_rate": 8.520731425834643e-06, + "loss": 0.7927, + "step": 1498 + }, + { + "epoch": 0.5492854525467205, + "grad_norm": 1.8248411417007446, + "learning_rate": 8.509305310950705e-06, + "loss": 0.8244, + "step": 1499 + }, + { + "epoch": 0.5496518871381458, + "grad_norm": 2.1215291023254395, + "learning_rate": 8.497881186144249e-06, + "loss": 0.8209, + "step": 1500 + }, + { + "epoch": 0.5500183217295713, + "grad_norm": 2.1072847843170166, + "learning_rate": 8.48645906666648e-06, + "loss": 0.8291, + "step": 1501 + }, + { + "epoch": 0.5503847563209967, + "grad_norm": 1.918225884437561, + "learning_rate": 8.475038967765942e-06, + "loss": 0.8143, + "step": 1502 + }, + { + "epoch": 0.5507511909124221, + "grad_norm": 2.096271276473999, + "learning_rate": 8.463620904688452e-06, + "loss": 0.8405, + "step": 1503 + }, + { + "epoch": 0.5511176255038476, + "grad_norm": 1.6998379230499268, + "learning_rate": 8.45220489267713e-06, + "loss": 0.8819, + "step": 1504 + }, + { + "epoch": 0.551484060095273, + "grad_norm": 2.25860333442688, + "learning_rate": 8.440790946972357e-06, + "loss": 0.8072, + "step": 1505 + }, + { + "epoch": 0.5518504946866984, + "grad_norm": 2.0350191593170166, + "learning_rate": 8.429379082811743e-06, + "loss": 0.819, + "step": 1506 + }, + { + "epoch": 0.5522169292781238, + "grad_norm": 1.8747131824493408, + "learning_rate": 8.417969315430138e-06, + "loss": 0.8611, + "step": 1507 + }, + { + "epoch": 0.5525833638695493, + "grad_norm": 2.0432024002075195, + "learning_rate": 8.40656166005957e-06, + "loss": 0.8181, + "step": 1508 + }, + { + "epoch": 0.5529497984609747, + "grad_norm": 1.9487144947052002, + "learning_rate": 8.395156131929266e-06, + "loss": 0.8138, + "step": 1509 + }, + { + "epoch": 0.5533162330524002, + "grad_norm": 1.801957130432129, + "learning_rate": 8.383752746265608e-06, + "loss": 0.8819, + "step": 1510 + }, + { + "epoch": 0.5536826676438256, + "grad_norm": 1.736411213874817, + "learning_rate": 8.372351518292112e-06, + "loss": 0.85, + "step": 1511 + }, + { + "epoch": 0.5540491022352511, + "grad_norm": 2.142106056213379, + "learning_rate": 8.360952463229424e-06, + "loss": 0.8645, + "step": 1512 + }, + { + "epoch": 0.5544155368266764, + "grad_norm": 2.392352342605591, + "learning_rate": 8.349555596295276e-06, + "loss": 0.7336, + "step": 1513 + }, + { + "epoch": 0.5547819714181018, + "grad_norm": 1.8606106042861938, + "learning_rate": 8.338160932704487e-06, + "loss": 0.9252, + "step": 1514 + }, + { + "epoch": 0.5551484060095273, + "grad_norm": 2.3040175437927246, + "learning_rate": 8.326768487668937e-06, + "loss": 0.7391, + "step": 1515 + }, + { + "epoch": 0.5555148406009527, + "grad_norm": 2.4299490451812744, + "learning_rate": 8.315378276397542e-06, + "loss": 0.7129, + "step": 1516 + }, + { + "epoch": 0.5558812751923782, + "grad_norm": 1.8836729526519775, + "learning_rate": 8.303990314096234e-06, + "loss": 0.8198, + "step": 1517 + }, + { + "epoch": 0.5562477097838036, + "grad_norm": 1.9984331130981445, + "learning_rate": 8.292604615967935e-06, + "loss": 0.8042, + "step": 1518 + }, + { + "epoch": 0.556614144375229, + "grad_norm": 2.047516345977783, + "learning_rate": 8.28122119721256e-06, + "loss": 0.8574, + "step": 1519 + }, + { + "epoch": 0.5569805789666544, + "grad_norm": 2.2532103061676025, + "learning_rate": 8.269840073026972e-06, + "loss": 0.857, + "step": 1520 + }, + { + "epoch": 0.5573470135580799, + "grad_norm": 2.170428991317749, + "learning_rate": 8.258461258604975e-06, + "loss": 0.8122, + "step": 1521 + }, + { + "epoch": 0.5577134481495053, + "grad_norm": 2.1111834049224854, + "learning_rate": 8.247084769137284e-06, + "loss": 0.7591, + "step": 1522 + }, + { + "epoch": 0.5580798827409308, + "grad_norm": 1.7928640842437744, + "learning_rate": 8.235710619811515e-06, + "loss": 0.879, + "step": 1523 + }, + { + "epoch": 0.5584463173323562, + "grad_norm": 1.917716383934021, + "learning_rate": 8.224338825812152e-06, + "loss": 0.8858, + "step": 1524 + }, + { + "epoch": 0.5588127519237817, + "grad_norm": 2.1861300468444824, + "learning_rate": 8.212969402320546e-06, + "loss": 0.863, + "step": 1525 + }, + { + "epoch": 0.559179186515207, + "grad_norm": 1.7714310884475708, + "learning_rate": 8.201602364514878e-06, + "loss": 0.8418, + "step": 1526 + }, + { + "epoch": 0.5595456211066324, + "grad_norm": 1.9631260633468628, + "learning_rate": 8.190237727570142e-06, + "loss": 0.8734, + "step": 1527 + }, + { + "epoch": 0.5599120556980579, + "grad_norm": 2.40706205368042, + "learning_rate": 8.178875506658133e-06, + "loss": 0.728, + "step": 1528 + }, + { + "epoch": 0.5602784902894833, + "grad_norm": 1.7918330430984497, + "learning_rate": 8.167515716947411e-06, + "loss": 0.8601, + "step": 1529 + }, + { + "epoch": 0.5606449248809088, + "grad_norm": 1.9069151878356934, + "learning_rate": 8.156158373603297e-06, + "loss": 0.8387, + "step": 1530 + }, + { + "epoch": 0.5610113594723342, + "grad_norm": 1.8654823303222656, + "learning_rate": 8.144803491787848e-06, + "loss": 0.884, + "step": 1531 + }, + { + "epoch": 0.5613777940637596, + "grad_norm": 2.0031216144561768, + "learning_rate": 8.13345108665983e-06, + "loss": 0.8334, + "step": 1532 + }, + { + "epoch": 0.561744228655185, + "grad_norm": 2.1736366748809814, + "learning_rate": 8.122101173374704e-06, + "loss": 0.8301, + "step": 1533 + }, + { + "epoch": 0.5621106632466105, + "grad_norm": 2.0205841064453125, + "learning_rate": 8.110753767084604e-06, + "loss": 0.8304, + "step": 1534 + }, + { + "epoch": 0.5624770978380359, + "grad_norm": 1.993735432624817, + "learning_rate": 8.099408882938319e-06, + "loss": 0.8576, + "step": 1535 + }, + { + "epoch": 0.5628435324294614, + "grad_norm": 2.1005938053131104, + "learning_rate": 8.088066536081269e-06, + "loss": 0.809, + "step": 1536 + }, + { + "epoch": 0.5632099670208868, + "grad_norm": 1.8967087268829346, + "learning_rate": 8.076726741655489e-06, + "loss": 0.8422, + "step": 1537 + }, + { + "epoch": 0.5635764016123122, + "grad_norm": 1.860841989517212, + "learning_rate": 8.065389514799609e-06, + "loss": 0.8539, + "step": 1538 + }, + { + "epoch": 0.5639428362037376, + "grad_norm": 2.1330482959747314, + "learning_rate": 8.054054870648818e-06, + "loss": 0.843, + "step": 1539 + }, + { + "epoch": 0.564309270795163, + "grad_norm": 2.035189390182495, + "learning_rate": 8.042722824334874e-06, + "loss": 0.8647, + "step": 1540 + }, + { + "epoch": 0.5646757053865885, + "grad_norm": 1.9112155437469482, + "learning_rate": 8.031393390986055e-06, + "loss": 0.7743, + "step": 1541 + }, + { + "epoch": 0.5650421399780139, + "grad_norm": 2.167698383331299, + "learning_rate": 8.020066585727156e-06, + "loss": 0.8544, + "step": 1542 + }, + { + "epoch": 0.5654085745694394, + "grad_norm": 2.4354660511016846, + "learning_rate": 8.00874242367947e-06, + "loss": 0.8111, + "step": 1543 + }, + { + "epoch": 0.5657750091608648, + "grad_norm": 2.0466530323028564, + "learning_rate": 7.997420919960741e-06, + "loss": 0.8742, + "step": 1544 + }, + { + "epoch": 0.5661414437522903, + "grad_norm": 2.8126063346862793, + "learning_rate": 7.986102089685189e-06, + "loss": 0.7313, + "step": 1545 + }, + { + "epoch": 0.5665078783437156, + "grad_norm": 2.1306872367858887, + "learning_rate": 7.974785947963441e-06, + "loss": 0.8469, + "step": 1546 + }, + { + "epoch": 0.5668743129351411, + "grad_norm": 1.9655848741531372, + "learning_rate": 7.963472509902556e-06, + "loss": 0.862, + "step": 1547 + }, + { + "epoch": 0.5672407475265665, + "grad_norm": 1.8828647136688232, + "learning_rate": 7.952161790605975e-06, + "loss": 0.8262, + "step": 1548 + }, + { + "epoch": 0.567607182117992, + "grad_norm": 1.8155533075332642, + "learning_rate": 7.940853805173503e-06, + "loss": 0.8533, + "step": 1549 + }, + { + "epoch": 0.5679736167094174, + "grad_norm": 2.217395782470703, + "learning_rate": 7.929548568701303e-06, + "loss": 0.7375, + "step": 1550 + }, + { + "epoch": 0.5683400513008428, + "grad_norm": 2.2814624309539795, + "learning_rate": 7.918246096281868e-06, + "loss": 0.7539, + "step": 1551 + }, + { + "epoch": 0.5687064858922682, + "grad_norm": 1.9111372232437134, + "learning_rate": 7.906946403003995e-06, + "loss": 0.8306, + "step": 1552 + }, + { + "epoch": 0.5690729204836936, + "grad_norm": 1.7845666408538818, + "learning_rate": 7.89564950395278e-06, + "loss": 0.7968, + "step": 1553 + }, + { + "epoch": 0.5694393550751191, + "grad_norm": 2.1907219886779785, + "learning_rate": 7.884355414209586e-06, + "loss": 0.8312, + "step": 1554 + }, + { + "epoch": 0.5698057896665445, + "grad_norm": 2.0254149436950684, + "learning_rate": 7.873064148852014e-06, + "loss": 0.857, + "step": 1555 + }, + { + "epoch": 0.57017222425797, + "grad_norm": 2.041749954223633, + "learning_rate": 7.861775722953913e-06, + "loss": 0.9304, + "step": 1556 + }, + { + "epoch": 0.5705386588493954, + "grad_norm": 2.1609628200531006, + "learning_rate": 7.850490151585326e-06, + "loss": 0.8433, + "step": 1557 + }, + { + "epoch": 0.5709050934408209, + "grad_norm": 2.2469706535339355, + "learning_rate": 7.839207449812493e-06, + "loss": 0.7868, + "step": 1558 + }, + { + "epoch": 0.5712715280322462, + "grad_norm": 1.949552059173584, + "learning_rate": 7.827927632697827e-06, + "loss": 0.8477, + "step": 1559 + }, + { + "epoch": 0.5716379626236717, + "grad_norm": 1.771660566329956, + "learning_rate": 7.816650715299873e-06, + "loss": 0.8199, + "step": 1560 + }, + { + "epoch": 0.5720043972150971, + "grad_norm": 1.9107954502105713, + "learning_rate": 7.805376712673324e-06, + "loss": 0.893, + "step": 1561 + }, + { + "epoch": 0.5723708318065225, + "grad_norm": 2.2006566524505615, + "learning_rate": 7.794105639868976e-06, + "loss": 0.8021, + "step": 1562 + }, + { + "epoch": 0.572737266397948, + "grad_norm": 1.9410490989685059, + "learning_rate": 7.782837511933707e-06, + "loss": 0.8815, + "step": 1563 + }, + { + "epoch": 0.5731037009893734, + "grad_norm": 1.9208725690841675, + "learning_rate": 7.771572343910471e-06, + "loss": 0.8115, + "step": 1564 + }, + { + "epoch": 0.5734701355807988, + "grad_norm": 2.0697717666625977, + "learning_rate": 7.760310150838267e-06, + "loss": 0.8229, + "step": 1565 + }, + { + "epoch": 0.5738365701722242, + "grad_norm": 2.2040605545043945, + "learning_rate": 7.749050947752119e-06, + "loss": 0.8481, + "step": 1566 + }, + { + "epoch": 0.5742030047636497, + "grad_norm": 2.007427215576172, + "learning_rate": 7.737794749683074e-06, + "loss": 0.8856, + "step": 1567 + }, + { + "epoch": 0.5745694393550751, + "grad_norm": 2.1302149295806885, + "learning_rate": 7.726541571658145e-06, + "loss": 0.8318, + "step": 1568 + }, + { + "epoch": 0.5749358739465006, + "grad_norm": 1.92087721824646, + "learning_rate": 7.715291428700336e-06, + "loss": 0.8355, + "step": 1569 + }, + { + "epoch": 0.575302308537926, + "grad_norm": 2.1725895404815674, + "learning_rate": 7.704044335828582e-06, + "loss": 0.8164, + "step": 1570 + }, + { + "epoch": 0.5756687431293515, + "grad_norm": 1.9096589088439941, + "learning_rate": 7.69280030805775e-06, + "loss": 0.8481, + "step": 1571 + }, + { + "epoch": 0.5760351777207768, + "grad_norm": 2.3047072887420654, + "learning_rate": 7.681559360398623e-06, + "loss": 0.7868, + "step": 1572 + }, + { + "epoch": 0.5764016123122023, + "grad_norm": 2.029869556427002, + "learning_rate": 7.67032150785787e-06, + "loss": 0.8592, + "step": 1573 + }, + { + "epoch": 0.5767680469036277, + "grad_norm": 2.2390494346618652, + "learning_rate": 7.659086765438017e-06, + "loss": 0.7462, + "step": 1574 + }, + { + "epoch": 0.5771344814950531, + "grad_norm": 2.311746120452881, + "learning_rate": 7.64785514813745e-06, + "loss": 0.8339, + "step": 1575 + }, + { + "epoch": 0.5775009160864786, + "grad_norm": 2.535600185394287, + "learning_rate": 7.636626670950375e-06, + "loss": 0.7523, + "step": 1576 + }, + { + "epoch": 0.577867350677904, + "grad_norm": 2.6063930988311768, + "learning_rate": 7.625401348866812e-06, + "loss": 0.8121, + "step": 1577 + }, + { + "epoch": 0.5782337852693294, + "grad_norm": 2.046186685562134, + "learning_rate": 7.614179196872566e-06, + "loss": 0.8722, + "step": 1578 + }, + { + "epoch": 0.5786002198607548, + "grad_norm": 1.9237748384475708, + "learning_rate": 7.602960229949216e-06, + "loss": 0.8812, + "step": 1579 + }, + { + "epoch": 0.5789666544521803, + "grad_norm": 2.2206146717071533, + "learning_rate": 7.591744463074075e-06, + "loss": 0.7857, + "step": 1580 + }, + { + "epoch": 0.5793330890436057, + "grad_norm": 2.3066954612731934, + "learning_rate": 7.580531911220195e-06, + "loss": 0.8151, + "step": 1581 + }, + { + "epoch": 0.5796995236350312, + "grad_norm": 1.853406310081482, + "learning_rate": 7.5693225893563326e-06, + "loss": 0.8598, + "step": 1582 + }, + { + "epoch": 0.5800659582264566, + "grad_norm": 1.8438457250595093, + "learning_rate": 7.5581165124469355e-06, + "loss": 0.8669, + "step": 1583 + }, + { + "epoch": 0.5804323928178821, + "grad_norm": 1.9874687194824219, + "learning_rate": 7.546913695452118e-06, + "loss": 0.854, + "step": 1584 + }, + { + "epoch": 0.5807988274093074, + "grad_norm": 2.0050768852233887, + "learning_rate": 7.535714153327639e-06, + "loss": 0.7932, + "step": 1585 + }, + { + "epoch": 0.5811652620007328, + "grad_norm": 2.3346457481384277, + "learning_rate": 7.5245179010248885e-06, + "loss": 0.785, + "step": 1586 + }, + { + "epoch": 0.5815316965921583, + "grad_norm": 2.0014426708221436, + "learning_rate": 7.513324953490859e-06, + "loss": 0.8025, + "step": 1587 + }, + { + "epoch": 0.5818981311835837, + "grad_norm": 1.9692394733428955, + "learning_rate": 7.502135325668143e-06, + "loss": 0.8393, + "step": 1588 + }, + { + "epoch": 0.5822645657750092, + "grad_norm": 2.0282275676727295, + "learning_rate": 7.49094903249489e-06, + "loss": 0.8652, + "step": 1589 + }, + { + "epoch": 0.5826310003664346, + "grad_norm": 1.8718082904815674, + "learning_rate": 7.47976608890481e-06, + "loss": 0.8978, + "step": 1590 + }, + { + "epoch": 0.58299743495786, + "grad_norm": 2.1026763916015625, + "learning_rate": 7.468586509827123e-06, + "loss": 0.8509, + "step": 1591 + }, + { + "epoch": 0.5833638695492854, + "grad_norm": 1.8932467699050903, + "learning_rate": 7.457410310186568e-06, + "loss": 0.8365, + "step": 1592 + }, + { + "epoch": 0.5837303041407109, + "grad_norm": 1.9677419662475586, + "learning_rate": 7.446237504903377e-06, + "loss": 0.8376, + "step": 1593 + }, + { + "epoch": 0.5840967387321363, + "grad_norm": 2.058506965637207, + "learning_rate": 7.435068108893244e-06, + "loss": 0.8204, + "step": 1594 + }, + { + "epoch": 0.5844631733235618, + "grad_norm": 2.407780647277832, + "learning_rate": 7.423902137067318e-06, + "loss": 0.7817, + "step": 1595 + }, + { + "epoch": 0.5848296079149872, + "grad_norm": 1.9901889562606812, + "learning_rate": 7.412739604332162e-06, + "loss": 0.8741, + "step": 1596 + }, + { + "epoch": 0.5851960425064126, + "grad_norm": 2.0285966396331787, + "learning_rate": 7.401580525589767e-06, + "loss": 0.8727, + "step": 1597 + }, + { + "epoch": 0.585562477097838, + "grad_norm": 2.058849334716797, + "learning_rate": 7.390424915737495e-06, + "loss": 0.8293, + "step": 1598 + }, + { + "epoch": 0.5859289116892634, + "grad_norm": 1.9227625131607056, + "learning_rate": 7.379272789668093e-06, + "loss": 0.8764, + "step": 1599 + }, + { + "epoch": 0.5862953462806889, + "grad_norm": 2.040282726287842, + "learning_rate": 7.3681241622696496e-06, + "loss": 0.8547, + "step": 1600 + }, + { + "epoch": 0.5866617808721143, + "grad_norm": 2.291186571121216, + "learning_rate": 7.3569790484255776e-06, + "loss": 0.862, + "step": 1601 + }, + { + "epoch": 0.5870282154635398, + "grad_norm": 2.185753583908081, + "learning_rate": 7.345837463014609e-06, + "loss": 0.8083, + "step": 1602 + }, + { + "epoch": 0.5873946500549652, + "grad_norm": 2.20452618598938, + "learning_rate": 7.334699420910758e-06, + "loss": 0.8042, + "step": 1603 + }, + { + "epoch": 0.5877610846463907, + "grad_norm": 1.9438961744308472, + "learning_rate": 7.323564936983312e-06, + "loss": 0.8873, + "step": 1604 + }, + { + "epoch": 0.588127519237816, + "grad_norm": 2.1404733657836914, + "learning_rate": 7.312434026096813e-06, + "loss": 0.8349, + "step": 1605 + }, + { + "epoch": 0.5884939538292415, + "grad_norm": 1.8856126070022583, + "learning_rate": 7.301306703111017e-06, + "loss": 0.8748, + "step": 1606 + }, + { + "epoch": 0.5888603884206669, + "grad_norm": 2.095350980758667, + "learning_rate": 7.290182982880909e-06, + "loss": 0.7623, + "step": 1607 + }, + { + "epoch": 0.5892268230120924, + "grad_norm": 2.0845468044281006, + "learning_rate": 7.2790628802566556e-06, + "loss": 0.8288, + "step": 1608 + }, + { + "epoch": 0.5895932576035178, + "grad_norm": 2.0782248973846436, + "learning_rate": 7.26794641008359e-06, + "loss": 0.7994, + "step": 1609 + }, + { + "epoch": 0.5899596921949432, + "grad_norm": 2.1508421897888184, + "learning_rate": 7.256833587202208e-06, + "loss": 0.8597, + "step": 1610 + }, + { + "epoch": 0.5903261267863686, + "grad_norm": 2.304708242416382, + "learning_rate": 7.245724426448117e-06, + "loss": 0.8211, + "step": 1611 + }, + { + "epoch": 0.590692561377794, + "grad_norm": 2.2719075679779053, + "learning_rate": 7.234618942652057e-06, + "loss": 0.8149, + "step": 1612 + }, + { + "epoch": 0.5910589959692195, + "grad_norm": 1.967847466468811, + "learning_rate": 7.223517150639845e-06, + "loss": 0.8223, + "step": 1613 + }, + { + "epoch": 0.5914254305606449, + "grad_norm": 2.4015612602233887, + "learning_rate": 7.2124190652323765e-06, + "loss": 0.8032, + "step": 1614 + }, + { + "epoch": 0.5917918651520704, + "grad_norm": 2.075702667236328, + "learning_rate": 7.2013247012455935e-06, + "loss": 0.8662, + "step": 1615 + }, + { + "epoch": 0.5921582997434958, + "grad_norm": 2.115882158279419, + "learning_rate": 7.190234073490477e-06, + "loss": 0.8034, + "step": 1616 + }, + { + "epoch": 0.5925247343349213, + "grad_norm": 2.18595552444458, + "learning_rate": 7.179147196773008e-06, + "loss": 0.8682, + "step": 1617 + }, + { + "epoch": 0.5928911689263466, + "grad_norm": 2.383286952972412, + "learning_rate": 7.168064085894173e-06, + "loss": 0.7243, + "step": 1618 + }, + { + "epoch": 0.5932576035177721, + "grad_norm": 2.022106885910034, + "learning_rate": 7.156984755649925e-06, + "loss": 0.7914, + "step": 1619 + }, + { + "epoch": 0.5936240381091975, + "grad_norm": 2.2780921459198, + "learning_rate": 7.1459092208311685e-06, + "loss": 0.8125, + "step": 1620 + }, + { + "epoch": 0.5939904727006229, + "grad_norm": 2.199862003326416, + "learning_rate": 7.13483749622375e-06, + "loss": 0.805, + "step": 1621 + }, + { + "epoch": 0.5943569072920484, + "grad_norm": 2.0567820072174072, + "learning_rate": 7.12376959660841e-06, + "loss": 0.7983, + "step": 1622 + }, + { + "epoch": 0.5947233418834738, + "grad_norm": 2.0858473777770996, + "learning_rate": 7.112705536760802e-06, + "loss": 0.7851, + "step": 1623 + }, + { + "epoch": 0.5950897764748992, + "grad_norm": 1.8858219385147095, + "learning_rate": 7.1016453314514475e-06, + "loss": 0.8713, + "step": 1624 + }, + { + "epoch": 0.5954562110663246, + "grad_norm": 2.3013741970062256, + "learning_rate": 7.090588995445722e-06, + "loss": 0.8711, + "step": 1625 + }, + { + "epoch": 0.5958226456577501, + "grad_norm": 1.9575581550598145, + "learning_rate": 7.079536543503833e-06, + "loss": 0.876, + "step": 1626 + }, + { + "epoch": 0.5961890802491755, + "grad_norm": 1.8408575057983398, + "learning_rate": 7.0684879903808015e-06, + "loss": 0.8855, + "step": 1627 + }, + { + "epoch": 0.596555514840601, + "grad_norm": 2.0506277084350586, + "learning_rate": 7.057443350826447e-06, + "loss": 0.831, + "step": 1628 + }, + { + "epoch": 0.5969219494320264, + "grad_norm": 2.0059189796447754, + "learning_rate": 7.046402639585366e-06, + "loss": 0.7393, + "step": 1629 + }, + { + "epoch": 0.5972883840234519, + "grad_norm": 2.1671557426452637, + "learning_rate": 7.035365871396911e-06, + "loss": 0.8705, + "step": 1630 + }, + { + "epoch": 0.5976548186148772, + "grad_norm": 1.984377384185791, + "learning_rate": 7.024333060995165e-06, + "loss": 0.8358, + "step": 1631 + }, + { + "epoch": 0.5980212532063027, + "grad_norm": 2.277132034301758, + "learning_rate": 7.013304223108931e-06, + "loss": 0.7796, + "step": 1632 + }, + { + "epoch": 0.5983876877977281, + "grad_norm": 2.3955905437469482, + "learning_rate": 7.002279372461706e-06, + "loss": 0.7446, + "step": 1633 + }, + { + "epoch": 0.5987541223891535, + "grad_norm": 2.4661448001861572, + "learning_rate": 6.991258523771667e-06, + "loss": 0.8515, + "step": 1634 + }, + { + "epoch": 0.599120556980579, + "grad_norm": 2.060856342315674, + "learning_rate": 6.980241691751649e-06, + "loss": 0.8076, + "step": 1635 + }, + { + "epoch": 0.5994869915720044, + "grad_norm": 2.204679250717163, + "learning_rate": 6.9692288911091275e-06, + "loss": 0.7761, + "step": 1636 + }, + { + "epoch": 0.5998534261634298, + "grad_norm": 2.0762739181518555, + "learning_rate": 6.9582201365461845e-06, + "loss": 0.8523, + "step": 1637 + }, + { + "epoch": 0.6002198607548552, + "grad_norm": 2.0019896030426025, + "learning_rate": 6.94721544275951e-06, + "loss": 0.8053, + "step": 1638 + }, + { + "epoch": 0.6005862953462807, + "grad_norm": 1.9410783052444458, + "learning_rate": 6.936214824440369e-06, + "loss": 0.876, + "step": 1639 + }, + { + "epoch": 0.6009527299377061, + "grad_norm": 2.2371299266815186, + "learning_rate": 6.925218296274592e-06, + "loss": 0.7718, + "step": 1640 + }, + { + "epoch": 0.6013191645291316, + "grad_norm": 2.1076481342315674, + "learning_rate": 6.914225872942546e-06, + "loss": 0.8437, + "step": 1641 + }, + { + "epoch": 0.601685599120557, + "grad_norm": 2.0942487716674805, + "learning_rate": 6.903237569119108e-06, + "loss": 0.8823, + "step": 1642 + }, + { + "epoch": 0.6020520337119825, + "grad_norm": 2.637465238571167, + "learning_rate": 6.892253399473673e-06, + "loss": 0.7504, + "step": 1643 + }, + { + "epoch": 0.6024184683034078, + "grad_norm": 2.225482702255249, + "learning_rate": 6.881273378670102e-06, + "loss": 0.8458, + "step": 1644 + }, + { + "epoch": 0.6027849028948332, + "grad_norm": 2.0072765350341797, + "learning_rate": 6.870297521366725e-06, + "loss": 0.797, + "step": 1645 + }, + { + "epoch": 0.6031513374862587, + "grad_norm": 2.540889024734497, + "learning_rate": 6.859325842216314e-06, + "loss": 0.7914, + "step": 1646 + }, + { + "epoch": 0.6035177720776841, + "grad_norm": 2.056896686553955, + "learning_rate": 6.848358355866063e-06, + "loss": 0.7791, + "step": 1647 + }, + { + "epoch": 0.6038842066691096, + "grad_norm": 2.06502103805542, + "learning_rate": 6.83739507695756e-06, + "loss": 0.8494, + "step": 1648 + }, + { + "epoch": 0.604250641260535, + "grad_norm": 2.031855821609497, + "learning_rate": 6.82643602012679e-06, + "loss": 0.856, + "step": 1649 + }, + { + "epoch": 0.6046170758519605, + "grad_norm": 2.2233779430389404, + "learning_rate": 6.815481200004088e-06, + "loss": 0.8112, + "step": 1650 + }, + { + "epoch": 0.6049835104433858, + "grad_norm": 2.1673338413238525, + "learning_rate": 6.804530631214143e-06, + "loss": 0.8606, + "step": 1651 + }, + { + "epoch": 0.6053499450348113, + "grad_norm": 2.1959850788116455, + "learning_rate": 6.793584328375969e-06, + "loss": 0.7925, + "step": 1652 + }, + { + "epoch": 0.6057163796262367, + "grad_norm": 2.0298149585723877, + "learning_rate": 6.782642306102871e-06, + "loss": 0.8058, + "step": 1653 + }, + { + "epoch": 0.6060828142176622, + "grad_norm": 2.510085344314575, + "learning_rate": 6.771704579002456e-06, + "loss": 0.8086, + "step": 1654 + }, + { + "epoch": 0.6064492488090876, + "grad_norm": 2.2274227142333984, + "learning_rate": 6.760771161676589e-06, + "loss": 0.8525, + "step": 1655 + }, + { + "epoch": 0.6068156834005131, + "grad_norm": 2.153073310852051, + "learning_rate": 6.7498420687213806e-06, + "loss": 0.7919, + "step": 1656 + }, + { + "epoch": 0.6071821179919384, + "grad_norm": 2.1709446907043457, + "learning_rate": 6.738917314727178e-06, + "loss": 0.8229, + "step": 1657 + }, + { + "epoch": 0.6075485525833638, + "grad_norm": 2.3454272747039795, + "learning_rate": 6.7279969142785165e-06, + "loss": 0.7746, + "step": 1658 + }, + { + "epoch": 0.6079149871747893, + "grad_norm": 2.0328891277313232, + "learning_rate": 6.717080881954139e-06, + "loss": 0.8782, + "step": 1659 + }, + { + "epoch": 0.6082814217662147, + "grad_norm": 2.0833816528320312, + "learning_rate": 6.706169232326949e-06, + "loss": 0.8619, + "step": 1660 + }, + { + "epoch": 0.6086478563576402, + "grad_norm": 2.4952611923217773, + "learning_rate": 6.695261979963995e-06, + "loss": 0.7782, + "step": 1661 + }, + { + "epoch": 0.6090142909490656, + "grad_norm": 2.224849224090576, + "learning_rate": 6.6843591394264676e-06, + "loss": 0.7831, + "step": 1662 + }, + { + "epoch": 0.609380725540491, + "grad_norm": 1.9657704830169678, + "learning_rate": 6.673460725269649e-06, + "loss": 0.8668, + "step": 1663 + }, + { + "epoch": 0.6097471601319164, + "grad_norm": 2.173832416534424, + "learning_rate": 6.662566752042929e-06, + "loss": 0.8046, + "step": 1664 + }, + { + "epoch": 0.6101135947233419, + "grad_norm": 1.9357271194458008, + "learning_rate": 6.651677234289762e-06, + "loss": 0.8335, + "step": 1665 + }, + { + "epoch": 0.6104800293147673, + "grad_norm": 2.003981113433838, + "learning_rate": 6.640792186547654e-06, + "loss": 0.8166, + "step": 1666 + }, + { + "epoch": 0.6108464639061928, + "grad_norm": 2.0950145721435547, + "learning_rate": 6.629911623348147e-06, + "loss": 0.8494, + "step": 1667 + }, + { + "epoch": 0.6112128984976182, + "grad_norm": 1.9635558128356934, + "learning_rate": 6.619035559216786e-06, + "loss": 0.8174, + "step": 1668 + }, + { + "epoch": 0.6115793330890436, + "grad_norm": 1.9462409019470215, + "learning_rate": 6.608164008673121e-06, + "loss": 0.8574, + "step": 1669 + }, + { + "epoch": 0.611945767680469, + "grad_norm": 1.9010626077651978, + "learning_rate": 6.597296986230674e-06, + "loss": 0.8177, + "step": 1670 + }, + { + "epoch": 0.6123122022718944, + "grad_norm": 2.2767860889434814, + "learning_rate": 6.586434506396924e-06, + "loss": 0.7865, + "step": 1671 + }, + { + "epoch": 0.6126786368633199, + "grad_norm": 2.176220417022705, + "learning_rate": 6.575576583673277e-06, + "loss": 0.8348, + "step": 1672 + }, + { + "epoch": 0.6130450714547453, + "grad_norm": 1.862492561340332, + "learning_rate": 6.5647232325550594e-06, + "loss": 0.8615, + "step": 1673 + }, + { + "epoch": 0.6134115060461708, + "grad_norm": 2.045877695083618, + "learning_rate": 6.553874467531496e-06, + "loss": 0.8176, + "step": 1674 + }, + { + "epoch": 0.6137779406375962, + "grad_norm": 2.3825926780700684, + "learning_rate": 6.54303030308569e-06, + "loss": 0.8194, + "step": 1675 + }, + { + "epoch": 0.6141443752290217, + "grad_norm": 2.3617899417877197, + "learning_rate": 6.532190753694605e-06, + "loss": 0.7633, + "step": 1676 + }, + { + "epoch": 0.614510809820447, + "grad_norm": 2.279677391052246, + "learning_rate": 6.521355833829033e-06, + "loss": 0.8462, + "step": 1677 + }, + { + "epoch": 0.6148772444118725, + "grad_norm": 2.1296496391296387, + "learning_rate": 6.5105255579536e-06, + "loss": 0.8904, + "step": 1678 + }, + { + "epoch": 0.6152436790032979, + "grad_norm": 2.403574228286743, + "learning_rate": 6.499699940526718e-06, + "loss": 0.8135, + "step": 1679 + }, + { + "epoch": 0.6156101135947234, + "grad_norm": 2.0573787689208984, + "learning_rate": 6.488878996000589e-06, + "loss": 0.804, + "step": 1680 + }, + { + "epoch": 0.6159765481861488, + "grad_norm": 2.0022242069244385, + "learning_rate": 6.478062738821175e-06, + "loss": 0.8562, + "step": 1681 + }, + { + "epoch": 0.6163429827775742, + "grad_norm": 2.484389305114746, + "learning_rate": 6.467251183428182e-06, + "loss": 0.8543, + "step": 1682 + }, + { + "epoch": 0.6167094173689996, + "grad_norm": 2.20928692817688, + "learning_rate": 6.456444344255037e-06, + "loss": 0.8088, + "step": 1683 + }, + { + "epoch": 0.617075851960425, + "grad_norm": 2.1175923347473145, + "learning_rate": 6.4456422357288665e-06, + "loss": 0.8111, + "step": 1684 + }, + { + "epoch": 0.6174422865518505, + "grad_norm": 2.030487298965454, + "learning_rate": 6.434844872270486e-06, + "loss": 0.8055, + "step": 1685 + }, + { + "epoch": 0.6178087211432759, + "grad_norm": 2.0303640365600586, + "learning_rate": 6.42405226829438e-06, + "loss": 0.8603, + "step": 1686 + }, + { + "epoch": 0.6181751557347014, + "grad_norm": 2.0851595401763916, + "learning_rate": 6.413264438208675e-06, + "loss": 0.8305, + "step": 1687 + }, + { + "epoch": 0.6185415903261268, + "grad_norm": 2.7440991401672363, + "learning_rate": 6.402481396415123e-06, + "loss": 0.7822, + "step": 1688 + }, + { + "epoch": 0.6189080249175523, + "grad_norm": 2.3908801078796387, + "learning_rate": 6.391703157309088e-06, + "loss": 0.7986, + "step": 1689 + }, + { + "epoch": 0.6192744595089776, + "grad_norm": 2.0651395320892334, + "learning_rate": 6.380929735279514e-06, + "loss": 0.8235, + "step": 1690 + }, + { + "epoch": 0.6196408941004031, + "grad_norm": 2.590243339538574, + "learning_rate": 6.370161144708924e-06, + "loss": 0.6925, + "step": 1691 + }, + { + "epoch": 0.6200073286918285, + "grad_norm": 2.1181154251098633, + "learning_rate": 6.359397399973386e-06, + "loss": 0.8766, + "step": 1692 + }, + { + "epoch": 0.6203737632832539, + "grad_norm": 2.16276216506958, + "learning_rate": 6.348638515442506e-06, + "loss": 0.8357, + "step": 1693 + }, + { + "epoch": 0.6207401978746794, + "grad_norm": 2.131016969680786, + "learning_rate": 6.337884505479383e-06, + "loss": 0.8125, + "step": 1694 + }, + { + "epoch": 0.6211066324661048, + "grad_norm": 1.936014175415039, + "learning_rate": 6.327135384440633e-06, + "loss": 0.8784, + "step": 1695 + }, + { + "epoch": 0.6214730670575302, + "grad_norm": 2.428760290145874, + "learning_rate": 6.316391166676323e-06, + "loss": 0.7639, + "step": 1696 + }, + { + "epoch": 0.6218395016489556, + "grad_norm": 2.2411906719207764, + "learning_rate": 6.3056518665299895e-06, + "loss": 0.8401, + "step": 1697 + }, + { + "epoch": 0.6222059362403811, + "grad_norm": 3.147825241088867, + "learning_rate": 6.294917498338602e-06, + "loss": 0.747, + "step": 1698 + }, + { + "epoch": 0.6225723708318065, + "grad_norm": 2.061725616455078, + "learning_rate": 6.284188076432536e-06, + "loss": 0.8259, + "step": 1699 + }, + { + "epoch": 0.622938805423232, + "grad_norm": 2.19020676612854, + "learning_rate": 6.2734636151355735e-06, + "loss": 0.87, + "step": 1700 + }, + { + "epoch": 0.6233052400146574, + "grad_norm": 2.1691603660583496, + "learning_rate": 6.262744128764868e-06, + "loss": 0.8505, + "step": 1701 + }, + { + "epoch": 0.6236716746060829, + "grad_norm": 2.165039300918579, + "learning_rate": 6.252029631630938e-06, + "loss": 0.7862, + "step": 1702 + }, + { + "epoch": 0.6240381091975082, + "grad_norm": 2.140118360519409, + "learning_rate": 6.2413201380376395e-06, + "loss": 0.8015, + "step": 1703 + }, + { + "epoch": 0.6244045437889337, + "grad_norm": 1.9693007469177246, + "learning_rate": 6.23061566228214e-06, + "loss": 0.8262, + "step": 1704 + }, + { + "epoch": 0.6247709783803591, + "grad_norm": 2.224019765853882, + "learning_rate": 6.21991621865492e-06, + "loss": 0.8128, + "step": 1705 + }, + { + "epoch": 0.6251374129717845, + "grad_norm": 2.5416271686553955, + "learning_rate": 6.209221821439737e-06, + "loss": 0.8195, + "step": 1706 + }, + { + "epoch": 0.62550384756321, + "grad_norm": 1.9668278694152832, + "learning_rate": 6.1985324849136094e-06, + "loss": 0.8628, + "step": 1707 + }, + { + "epoch": 0.6258702821546354, + "grad_norm": 2.0069453716278076, + "learning_rate": 6.187848223346804e-06, + "loss": 0.8441, + "step": 1708 + }, + { + "epoch": 0.6262367167460609, + "grad_norm": 2.186694383621216, + "learning_rate": 6.177169051002815e-06, + "loss": 0.8359, + "step": 1709 + }, + { + "epoch": 0.6266031513374862, + "grad_norm": 2.1202750205993652, + "learning_rate": 6.166494982138328e-06, + "loss": 0.813, + "step": 1710 + }, + { + "epoch": 0.6269695859289117, + "grad_norm": 2.4543404579162598, + "learning_rate": 6.155826031003234e-06, + "loss": 0.837, + "step": 1711 + }, + { + "epoch": 0.6273360205203371, + "grad_norm": 2.054497003555298, + "learning_rate": 6.145162211840576e-06, + "loss": 0.8348, + "step": 1712 + }, + { + "epoch": 0.6277024551117626, + "grad_norm": 2.0176241397857666, + "learning_rate": 6.134503538886558e-06, + "loss": 0.7921, + "step": 1713 + }, + { + "epoch": 0.628068889703188, + "grad_norm": 2.011383533477783, + "learning_rate": 6.123850026370512e-06, + "loss": 0.8338, + "step": 1714 + }, + { + "epoch": 0.6284353242946135, + "grad_norm": 2.33731746673584, + "learning_rate": 6.113201688514872e-06, + "loss": 0.785, + "step": 1715 + }, + { + "epoch": 0.6288017588860388, + "grad_norm": 2.0671072006225586, + "learning_rate": 6.102558539535168e-06, + "loss": 0.8809, + "step": 1716 + }, + { + "epoch": 0.6291681934774642, + "grad_norm": 2.25299334526062, + "learning_rate": 6.091920593640013e-06, + "loss": 0.8024, + "step": 1717 + }, + { + "epoch": 0.6295346280688897, + "grad_norm": 1.911569356918335, + "learning_rate": 6.081287865031056e-06, + "loss": 0.8447, + "step": 1718 + }, + { + "epoch": 0.6299010626603151, + "grad_norm": 1.5879853963851929, + "learning_rate": 6.070660367902998e-06, + "loss": 0.5188, + "step": 1719 + }, + { + "epoch": 0.6302674972517406, + "grad_norm": 1.9963817596435547, + "learning_rate": 6.0600381164435405e-06, + "loss": 0.8051, + "step": 1720 + }, + { + "epoch": 0.630633931843166, + "grad_norm": 2.1238741874694824, + "learning_rate": 6.049421124833395e-06, + "loss": 0.84, + "step": 1721 + }, + { + "epoch": 0.6310003664345915, + "grad_norm": 2.239947557449341, + "learning_rate": 6.0388094072462465e-06, + "loss": 0.8142, + "step": 1722 + }, + { + "epoch": 0.6313668010260168, + "grad_norm": 2.5329320430755615, + "learning_rate": 6.028202977848734e-06, + "loss": 0.7447, + "step": 1723 + }, + { + "epoch": 0.6317332356174423, + "grad_norm": 2.675870418548584, + "learning_rate": 6.017601850800447e-06, + "loss": 0.8132, + "step": 1724 + }, + { + "epoch": 0.6320996702088677, + "grad_norm": 1.9416941404342651, + "learning_rate": 6.007006040253885e-06, + "loss": 0.824, + "step": 1725 + }, + { + "epoch": 0.6324661048002932, + "grad_norm": 2.223480224609375, + "learning_rate": 5.996415560354456e-06, + "loss": 0.8814, + "step": 1726 + }, + { + "epoch": 0.6328325393917186, + "grad_norm": 1.969476342201233, + "learning_rate": 5.985830425240455e-06, + "loss": 0.8341, + "step": 1727 + }, + { + "epoch": 0.633198973983144, + "grad_norm": 1.9697409868240356, + "learning_rate": 5.975250649043039e-06, + "loss": 0.915, + "step": 1728 + }, + { + "epoch": 0.6335654085745694, + "grad_norm": 2.4558944702148438, + "learning_rate": 5.964676245886205e-06, + "loss": 0.8503, + "step": 1729 + }, + { + "epoch": 0.6339318431659948, + "grad_norm": 1.9908795356750488, + "learning_rate": 5.954107229886784e-06, + "loss": 0.8058, + "step": 1730 + }, + { + "epoch": 0.6342982777574203, + "grad_norm": 2.4454500675201416, + "learning_rate": 5.94354361515441e-06, + "loss": 0.7536, + "step": 1731 + }, + { + "epoch": 0.6346647123488457, + "grad_norm": 2.3198952674865723, + "learning_rate": 5.932985415791513e-06, + "loss": 0.8192, + "step": 1732 + }, + { + "epoch": 0.6350311469402712, + "grad_norm": 1.959184169769287, + "learning_rate": 5.92243264589329e-06, + "loss": 0.8082, + "step": 1733 + }, + { + "epoch": 0.6353975815316966, + "grad_norm": 2.3622608184814453, + "learning_rate": 5.911885319547688e-06, + "loss": 0.8634, + "step": 1734 + }, + { + "epoch": 0.635764016123122, + "grad_norm": 2.3816590309143066, + "learning_rate": 5.901343450835388e-06, + "loss": 0.8062, + "step": 1735 + }, + { + "epoch": 0.6361304507145474, + "grad_norm": 2.5151548385620117, + "learning_rate": 5.890807053829784e-06, + "loss": 0.757, + "step": 1736 + }, + { + "epoch": 0.6364968853059729, + "grad_norm": 2.0787878036499023, + "learning_rate": 5.8802761425969654e-06, + "loss": 0.8356, + "step": 1737 + }, + { + "epoch": 0.6368633198973983, + "grad_norm": 2.1201980113983154, + "learning_rate": 5.869750731195702e-06, + "loss": 0.9005, + "step": 1738 + }, + { + "epoch": 0.6372297544888238, + "grad_norm": 2.0525357723236084, + "learning_rate": 5.859230833677423e-06, + "loss": 0.8693, + "step": 1739 + }, + { + "epoch": 0.6375961890802492, + "grad_norm": 2.3157238960266113, + "learning_rate": 5.848716464086184e-06, + "loss": 0.8153, + "step": 1740 + }, + { + "epoch": 0.6379626236716746, + "grad_norm": 2.4236457347869873, + "learning_rate": 5.838207636458674e-06, + "loss": 0.8364, + "step": 1741 + }, + { + "epoch": 0.6383290582631, + "grad_norm": 2.645217180252075, + "learning_rate": 5.8277043648241735e-06, + "loss": 0.7743, + "step": 1742 + }, + { + "epoch": 0.6386954928545254, + "grad_norm": 2.5812301635742188, + "learning_rate": 5.8172066632045535e-06, + "loss": 0.8233, + "step": 1743 + }, + { + "epoch": 0.6390619274459509, + "grad_norm": 2.15390944480896, + "learning_rate": 5.8067145456142475e-06, + "loss": 0.8209, + "step": 1744 + }, + { + "epoch": 0.6394283620373763, + "grad_norm": 2.439054012298584, + "learning_rate": 5.796228026060235e-06, + "loss": 0.744, + "step": 1745 + }, + { + "epoch": 0.6397947966288018, + "grad_norm": 2.1751368045806885, + "learning_rate": 5.785747118542013e-06, + "loss": 0.7693, + "step": 1746 + }, + { + "epoch": 0.6401612312202272, + "grad_norm": 2.5186259746551514, + "learning_rate": 5.7752718370515985e-06, + "loss": 0.7651, + "step": 1747 + }, + { + "epoch": 0.6405276658116527, + "grad_norm": 2.056795835494995, + "learning_rate": 5.764802195573495e-06, + "loss": 0.8122, + "step": 1748 + }, + { + "epoch": 0.640894100403078, + "grad_norm": 2.7081797122955322, + "learning_rate": 5.7543382080846675e-06, + "loss": 0.6878, + "step": 1749 + }, + { + "epoch": 0.6412605349945035, + "grad_norm": 2.0384063720703125, + "learning_rate": 5.7438798885545475e-06, + "loss": 0.8603, + "step": 1750 + }, + { + "epoch": 0.6416269695859289, + "grad_norm": 2.258711099624634, + "learning_rate": 5.733427250944985e-06, + "loss": 0.7606, + "step": 1751 + }, + { + "epoch": 0.6419934041773543, + "grad_norm": 2.474824905395508, + "learning_rate": 5.722980309210253e-06, + "loss": 0.7513, + "step": 1752 + }, + { + "epoch": 0.6423598387687798, + "grad_norm": 1.9493567943572998, + "learning_rate": 5.712539077297023e-06, + "loss": 0.8404, + "step": 1753 + }, + { + "epoch": 0.6427262733602052, + "grad_norm": 1.9995802640914917, + "learning_rate": 5.7021035691443375e-06, + "loss": 0.8871, + "step": 1754 + }, + { + "epoch": 0.6430927079516306, + "grad_norm": 2.001211404800415, + "learning_rate": 5.691673798683606e-06, + "loss": 0.8844, + "step": 1755 + }, + { + "epoch": 0.643459142543056, + "grad_norm": 1.9877080917358398, + "learning_rate": 5.681249779838563e-06, + "loss": 0.77, + "step": 1756 + }, + { + "epoch": 0.6438255771344815, + "grad_norm": 2.3511409759521484, + "learning_rate": 5.670831526525279e-06, + "loss": 0.7055, + "step": 1757 + }, + { + "epoch": 0.6441920117259069, + "grad_norm": 2.2715601921081543, + "learning_rate": 5.660419052652123e-06, + "loss": 0.8774, + "step": 1758 + }, + { + "epoch": 0.6445584463173324, + "grad_norm": 2.2003986835479736, + "learning_rate": 5.650012372119751e-06, + "loss": 0.7563, + "step": 1759 + }, + { + "epoch": 0.6449248809087578, + "grad_norm": 2.1989638805389404, + "learning_rate": 5.639611498821076e-06, + "loss": 0.8778, + "step": 1760 + }, + { + "epoch": 0.6452913155001833, + "grad_norm": 2.143601417541504, + "learning_rate": 5.62921644664127e-06, + "loss": 0.8044, + "step": 1761 + }, + { + "epoch": 0.6456577500916086, + "grad_norm": 2.3783957958221436, + "learning_rate": 5.618827229457722e-06, + "loss": 0.808, + "step": 1762 + }, + { + "epoch": 0.6460241846830341, + "grad_norm": 2.843330144882202, + "learning_rate": 5.60844386114004e-06, + "loss": 0.7242, + "step": 1763 + }, + { + "epoch": 0.6463906192744595, + "grad_norm": 2.4551148414611816, + "learning_rate": 5.598066355550023e-06, + "loss": 0.8453, + "step": 1764 + }, + { + "epoch": 0.6467570538658849, + "grad_norm": 2.3557980060577393, + "learning_rate": 5.587694726541645e-06, + "loss": 0.8925, + "step": 1765 + }, + { + "epoch": 0.6471234884573104, + "grad_norm": 2.076031446456909, + "learning_rate": 5.577328987961022e-06, + "loss": 0.8178, + "step": 1766 + }, + { + "epoch": 0.6474899230487358, + "grad_norm": 2.304417371749878, + "learning_rate": 5.56696915364642e-06, + "loss": 0.8694, + "step": 1767 + }, + { + "epoch": 0.6478563576401613, + "grad_norm": 1.8749827146530151, + "learning_rate": 5.556615237428219e-06, + "loss": 0.8058, + "step": 1768 + }, + { + "epoch": 0.6482227922315866, + "grad_norm": 2.451500654220581, + "learning_rate": 5.546267253128897e-06, + "loss": 0.9056, + "step": 1769 + }, + { + "epoch": 0.6485892268230121, + "grad_norm": 2.123460531234741, + "learning_rate": 5.5359252145630186e-06, + "loss": 0.7229, + "step": 1770 + }, + { + "epoch": 0.6489556614144375, + "grad_norm": 2.1700215339660645, + "learning_rate": 5.525589135537195e-06, + "loss": 0.8278, + "step": 1771 + }, + { + "epoch": 0.649322096005863, + "grad_norm": 2.158203125, + "learning_rate": 5.515259029850104e-06, + "loss": 0.853, + "step": 1772 + }, + { + "epoch": 0.6496885305972884, + "grad_norm": 2.275250196456909, + "learning_rate": 5.504934911292428e-06, + "loss": 0.8514, + "step": 1773 + }, + { + "epoch": 0.6500549651887139, + "grad_norm": 2.0677850246429443, + "learning_rate": 5.494616793646867e-06, + "loss": 0.8981, + "step": 1774 + }, + { + "epoch": 0.6504213997801392, + "grad_norm": 2.440816640853882, + "learning_rate": 5.484304690688113e-06, + "loss": 0.8005, + "step": 1775 + }, + { + "epoch": 0.6507878343715646, + "grad_norm": 2.1558449268341064, + "learning_rate": 5.473998616182825e-06, + "loss": 0.8014, + "step": 1776 + }, + { + "epoch": 0.6511542689629901, + "grad_norm": 2.008288860321045, + "learning_rate": 5.463698583889605e-06, + "loss": 0.8639, + "step": 1777 + }, + { + "epoch": 0.6515207035544155, + "grad_norm": 2.2373805046081543, + "learning_rate": 5.4534046075590016e-06, + "loss": 0.8344, + "step": 1778 + }, + { + "epoch": 0.651887138145841, + "grad_norm": 2.244774580001831, + "learning_rate": 5.443116700933475e-06, + "loss": 0.7931, + "step": 1779 + }, + { + "epoch": 0.6522535727372664, + "grad_norm": 2.13997745513916, + "learning_rate": 5.432834877747377e-06, + "loss": 0.8223, + "step": 1780 + }, + { + "epoch": 0.6526200073286919, + "grad_norm": 2.2929978370666504, + "learning_rate": 5.422559151726952e-06, + "loss": 0.8176, + "step": 1781 + }, + { + "epoch": 0.6529864419201172, + "grad_norm": 2.4449853897094727, + "learning_rate": 5.41228953659028e-06, + "loss": 0.8079, + "step": 1782 + }, + { + "epoch": 0.6533528765115427, + "grad_norm": 2.67828106880188, + "learning_rate": 5.402026046047311e-06, + "loss": 0.8899, + "step": 1783 + }, + { + "epoch": 0.6537193111029681, + "grad_norm": 2.5562262535095215, + "learning_rate": 5.391768693799795e-06, + "loss": 0.7825, + "step": 1784 + }, + { + "epoch": 0.6540857456943936, + "grad_norm": 2.0743002891540527, + "learning_rate": 5.381517493541301e-06, + "loss": 0.8028, + "step": 1785 + }, + { + "epoch": 0.654452180285819, + "grad_norm": 1.9827677011489868, + "learning_rate": 5.371272458957188e-06, + "loss": 0.8346, + "step": 1786 + }, + { + "epoch": 0.6548186148772445, + "grad_norm": 2.188796281814575, + "learning_rate": 5.361033603724571e-06, + "loss": 0.8082, + "step": 1787 + }, + { + "epoch": 0.6551850494686698, + "grad_norm": 2.5796215534210205, + "learning_rate": 5.350800941512321e-06, + "loss": 0.825, + "step": 1788 + }, + { + "epoch": 0.6555514840600952, + "grad_norm": 2.0983974933624268, + "learning_rate": 5.340574485981046e-06, + "loss": 0.8148, + "step": 1789 + }, + { + "epoch": 0.6559179186515207, + "grad_norm": 2.4515507221221924, + "learning_rate": 5.3303542507830655e-06, + "loss": 0.7716, + "step": 1790 + }, + { + "epoch": 0.6562843532429461, + "grad_norm": 2.2113089561462402, + "learning_rate": 5.320140249562396e-06, + "loss": 0.8046, + "step": 1791 + }, + { + "epoch": 0.6566507878343716, + "grad_norm": 2.609797716140747, + "learning_rate": 5.309932495954722e-06, + "loss": 0.8499, + "step": 1792 + }, + { + "epoch": 0.657017222425797, + "grad_norm": 2.038630247116089, + "learning_rate": 5.2997310035874e-06, + "loss": 0.7718, + "step": 1793 + }, + { + "epoch": 0.6573836570172225, + "grad_norm": 1.9817092418670654, + "learning_rate": 5.289535786079425e-06, + "loss": 0.7688, + "step": 1794 + }, + { + "epoch": 0.6577500916086478, + "grad_norm": 2.002600908279419, + "learning_rate": 5.279346857041408e-06, + "loss": 0.8807, + "step": 1795 + }, + { + "epoch": 0.6581165262000733, + "grad_norm": 2.2284350395202637, + "learning_rate": 5.269164230075574e-06, + "loss": 0.7766, + "step": 1796 + }, + { + "epoch": 0.6584829607914987, + "grad_norm": 2.0263407230377197, + "learning_rate": 5.258987918775725e-06, + "loss": 0.8194, + "step": 1797 + }, + { + "epoch": 0.6588493953829242, + "grad_norm": 2.078047752380371, + "learning_rate": 5.248817936727242e-06, + "loss": 0.8122, + "step": 1798 + }, + { + "epoch": 0.6592158299743496, + "grad_norm": 2.666020154953003, + "learning_rate": 5.238654297507048e-06, + "loss": 0.7913, + "step": 1799 + }, + { + "epoch": 0.659582264565775, + "grad_norm": 2.093247890472412, + "learning_rate": 5.228497014683604e-06, + "loss": 0.8071, + "step": 1800 + }, + { + "epoch": 0.6599486991572004, + "grad_norm": 2.7277653217315674, + "learning_rate": 5.218346101816884e-06, + "loss": 0.733, + "step": 1801 + }, + { + "epoch": 0.6603151337486258, + "grad_norm": 2.3742880821228027, + "learning_rate": 5.20820157245836e-06, + "loss": 0.8094, + "step": 1802 + }, + { + "epoch": 0.6606815683400513, + "grad_norm": 2.258880615234375, + "learning_rate": 5.198063440150972e-06, + "loss": 0.8041, + "step": 1803 + }, + { + "epoch": 0.6610480029314767, + "grad_norm": 2.546694040298462, + "learning_rate": 5.187931718429129e-06, + "loss": 0.779, + "step": 1804 + }, + { + "epoch": 0.6614144375229022, + "grad_norm": 2.5752904415130615, + "learning_rate": 5.177806420818688e-06, + "loss": 0.8587, + "step": 1805 + }, + { + "epoch": 0.6617808721143276, + "grad_norm": 2.5327069759368896, + "learning_rate": 5.167687560836908e-06, + "loss": 0.7962, + "step": 1806 + }, + { + "epoch": 0.6621473067057531, + "grad_norm": 2.1464478969573975, + "learning_rate": 5.15757515199248e-06, + "loss": 0.8641, + "step": 1807 + }, + { + "epoch": 0.6625137412971784, + "grad_norm": 2.343508005142212, + "learning_rate": 5.147469207785463e-06, + "loss": 0.842, + "step": 1808 + }, + { + "epoch": 0.6628801758886039, + "grad_norm": 2.575166940689087, + "learning_rate": 5.137369741707293e-06, + "loss": 0.7338, + "step": 1809 + }, + { + "epoch": 0.6632466104800293, + "grad_norm": 2.181368827819824, + "learning_rate": 5.127276767240761e-06, + "loss": 0.7625, + "step": 1810 + }, + { + "epoch": 0.6636130450714548, + "grad_norm": 2.4319188594818115, + "learning_rate": 5.117190297859986e-06, + "loss": 0.7884, + "step": 1811 + }, + { + "epoch": 0.6639794796628802, + "grad_norm": 2.30359148979187, + "learning_rate": 5.107110347030413e-06, + "loss": 0.755, + "step": 1812 + }, + { + "epoch": 0.6643459142543056, + "grad_norm": 2.1688833236694336, + "learning_rate": 5.0970369282087654e-06, + "loss": 0.7901, + "step": 1813 + }, + { + "epoch": 0.664712348845731, + "grad_norm": 2.0516762733459473, + "learning_rate": 5.086970054843061e-06, + "loss": 0.8528, + "step": 1814 + }, + { + "epoch": 0.6650787834371564, + "grad_norm": 2.297912836074829, + "learning_rate": 5.07690974037258e-06, + "loss": 0.8266, + "step": 1815 + }, + { + "epoch": 0.6654452180285819, + "grad_norm": 2.228154420852661, + "learning_rate": 5.0668559982278396e-06, + "loss": 0.8252, + "step": 1816 + }, + { + "epoch": 0.6658116526200073, + "grad_norm": 2.3292181491851807, + "learning_rate": 5.056808841830592e-06, + "loss": 0.8072, + "step": 1817 + }, + { + "epoch": 0.6661780872114328, + "grad_norm": 2.572441577911377, + "learning_rate": 5.046768284593786e-06, + "loss": 0.782, + "step": 1818 + }, + { + "epoch": 0.6665445218028582, + "grad_norm": 2.1662983894348145, + "learning_rate": 5.036734339921563e-06, + "loss": 0.7953, + "step": 1819 + }, + { + "epoch": 0.6669109563942837, + "grad_norm": 1.9589260816574097, + "learning_rate": 5.026707021209243e-06, + "loss": 0.8491, + "step": 1820 + }, + { + "epoch": 0.667277390985709, + "grad_norm": 2.5949389934539795, + "learning_rate": 5.016686341843298e-06, + "loss": 0.7735, + "step": 1821 + }, + { + "epoch": 0.6676438255771345, + "grad_norm": 2.1354405879974365, + "learning_rate": 5.0066723152013375e-06, + "loss": 0.8408, + "step": 1822 + }, + { + "epoch": 0.6680102601685599, + "grad_norm": 2.4556338787078857, + "learning_rate": 4.996664954652082e-06, + "loss": 0.8595, + "step": 1823 + }, + { + "epoch": 0.6683766947599853, + "grad_norm": 2.425250291824341, + "learning_rate": 4.986664273555361e-06, + "loss": 0.8545, + "step": 1824 + }, + { + "epoch": 0.6687431293514108, + "grad_norm": 2.1328673362731934, + "learning_rate": 4.976670285262084e-06, + "loss": 0.84, + "step": 1825 + }, + { + "epoch": 0.6691095639428362, + "grad_norm": 2.421682834625244, + "learning_rate": 4.966683003114226e-06, + "loss": 0.8028, + "step": 1826 + }, + { + "epoch": 0.6694759985342617, + "grad_norm": 2.466822385787964, + "learning_rate": 4.956702440444815e-06, + "loss": 0.8076, + "step": 1827 + }, + { + "epoch": 0.669842433125687, + "grad_norm": 2.335223436355591, + "learning_rate": 4.9467286105778914e-06, + "loss": 0.8457, + "step": 1828 + }, + { + "epoch": 0.6702088677171125, + "grad_norm": 2.1934590339660645, + "learning_rate": 4.936761526828528e-06, + "loss": 0.7996, + "step": 1829 + }, + { + "epoch": 0.6705753023085379, + "grad_norm": 2.3251702785491943, + "learning_rate": 4.926801202502777e-06, + "loss": 0.8577, + "step": 1830 + }, + { + "epoch": 0.6709417368999634, + "grad_norm": 1.9324642419815063, + "learning_rate": 4.916847650897671e-06, + "loss": 0.7787, + "step": 1831 + }, + { + "epoch": 0.6713081714913888, + "grad_norm": 2.155264139175415, + "learning_rate": 4.906900885301204e-06, + "loss": 0.822, + "step": 1832 + }, + { + "epoch": 0.6716746060828143, + "grad_norm": 2.3645036220550537, + "learning_rate": 4.896960918992314e-06, + "loss": 0.7823, + "step": 1833 + }, + { + "epoch": 0.6720410406742396, + "grad_norm": 2.374627113342285, + "learning_rate": 4.887027765240847e-06, + "loss": 0.7844, + "step": 1834 + }, + { + "epoch": 0.6724074752656651, + "grad_norm": 2.294753074645996, + "learning_rate": 4.877101437307567e-06, + "loss": 0.8298, + "step": 1835 + }, + { + "epoch": 0.6727739098570905, + "grad_norm": 2.3122475147247314, + "learning_rate": 4.867181948444124e-06, + "loss": 0.7986, + "step": 1836 + }, + { + "epoch": 0.6731403444485159, + "grad_norm": 2.2505087852478027, + "learning_rate": 4.8572693118930355e-06, + "loss": 0.8444, + "step": 1837 + }, + { + "epoch": 0.6735067790399414, + "grad_norm": 2.7303593158721924, + "learning_rate": 4.847363540887676e-06, + "loss": 0.7972, + "step": 1838 + }, + { + "epoch": 0.6738732136313668, + "grad_norm": 2.4003570079803467, + "learning_rate": 4.837464648652241e-06, + "loss": 0.8782, + "step": 1839 + }, + { + "epoch": 0.6742396482227923, + "grad_norm": 2.2989132404327393, + "learning_rate": 4.827572648401761e-06, + "loss": 0.8155, + "step": 1840 + }, + { + "epoch": 0.6746060828142176, + "grad_norm": 2.5501468181610107, + "learning_rate": 4.817687553342049e-06, + "loss": 0.8128, + "step": 1841 + }, + { + "epoch": 0.6749725174056431, + "grad_norm": 2.325488328933716, + "learning_rate": 4.8078093766697135e-06, + "loss": 0.7826, + "step": 1842 + }, + { + "epoch": 0.6753389519970685, + "grad_norm": 1.9862393140792847, + "learning_rate": 4.7979381315721206e-06, + "loss": 0.8583, + "step": 1843 + }, + { + "epoch": 0.675705386588494, + "grad_norm": 2.321089267730713, + "learning_rate": 4.788073831227377e-06, + "loss": 0.827, + "step": 1844 + }, + { + "epoch": 0.6760718211799194, + "grad_norm": 2.392892360687256, + "learning_rate": 4.778216488804329e-06, + "loss": 0.8084, + "step": 1845 + }, + { + "epoch": 0.6764382557713449, + "grad_norm": 2.309082269668579, + "learning_rate": 4.768366117462527e-06, + "loss": 0.8634, + "step": 1846 + }, + { + "epoch": 0.6768046903627702, + "grad_norm": 2.0040342807769775, + "learning_rate": 4.758522730352221e-06, + "loss": 0.8197, + "step": 1847 + }, + { + "epoch": 0.6771711249541956, + "grad_norm": 2.4351770877838135, + "learning_rate": 4.7486863406143335e-06, + "loss": 0.8073, + "step": 1848 + }, + { + "epoch": 0.6775375595456211, + "grad_norm": 2.468825101852417, + "learning_rate": 4.738856961380438e-06, + "loss": 0.8616, + "step": 1849 + }, + { + "epoch": 0.6779039941370465, + "grad_norm": 2.3825764656066895, + "learning_rate": 4.729034605772762e-06, + "loss": 0.7066, + "step": 1850 + }, + { + "epoch": 0.678270428728472, + "grad_norm": 1.9712620973587036, + "learning_rate": 4.719219286904154e-06, + "loss": 0.846, + "step": 1851 + }, + { + "epoch": 0.6786368633198974, + "grad_norm": 2.075200319290161, + "learning_rate": 4.709411017878057e-06, + "loss": 0.8984, + "step": 1852 + }, + { + "epoch": 0.6790032979113229, + "grad_norm": 2.2839746475219727, + "learning_rate": 4.699609811788518e-06, + "loss": 0.7861, + "step": 1853 + }, + { + "epoch": 0.6793697325027482, + "grad_norm": 2.22563099861145, + "learning_rate": 4.68981568172014e-06, + "loss": 0.8667, + "step": 1854 + }, + { + "epoch": 0.6797361670941737, + "grad_norm": 2.270456314086914, + "learning_rate": 4.6800286407480925e-06, + "loss": 0.7964, + "step": 1855 + }, + { + "epoch": 0.6801026016855991, + "grad_norm": 2.176010847091675, + "learning_rate": 4.670248701938073e-06, + "loss": 0.8349, + "step": 1856 + }, + { + "epoch": 0.6804690362770246, + "grad_norm": 2.3070919513702393, + "learning_rate": 4.660475878346302e-06, + "loss": 0.793, + "step": 1857 + }, + { + "epoch": 0.68083547086845, + "grad_norm": 2.3936715126037598, + "learning_rate": 4.650710183019503e-06, + "loss": 0.8097, + "step": 1858 + }, + { + "epoch": 0.6812019054598755, + "grad_norm": 2.1490304470062256, + "learning_rate": 4.640951628994873e-06, + "loss": 0.7925, + "step": 1859 + }, + { + "epoch": 0.6815683400513008, + "grad_norm": 2.260568857192993, + "learning_rate": 4.631200229300086e-06, + "loss": 0.8572, + "step": 1860 + }, + { + "epoch": 0.6819347746427262, + "grad_norm": 2.387512683868408, + "learning_rate": 4.621455996953258e-06, + "loss": 0.7959, + "step": 1861 + }, + { + "epoch": 0.6823012092341517, + "grad_norm": 2.167491912841797, + "learning_rate": 4.611718944962943e-06, + "loss": 0.8902, + "step": 1862 + }, + { + "epoch": 0.6826676438255771, + "grad_norm": 2.0454623699188232, + "learning_rate": 4.601989086328108e-06, + "loss": 0.8615, + "step": 1863 + }, + { + "epoch": 0.6830340784170026, + "grad_norm": 2.572718381881714, + "learning_rate": 4.592266434038113e-06, + "loss": 0.81, + "step": 1864 + }, + { + "epoch": 0.683400513008428, + "grad_norm": 2.236222743988037, + "learning_rate": 4.582551001072693e-06, + "loss": 0.8717, + "step": 1865 + }, + { + "epoch": 0.6837669475998535, + "grad_norm": 2.2675299644470215, + "learning_rate": 4.572842800401955e-06, + "loss": 0.8476, + "step": 1866 + }, + { + "epoch": 0.6841333821912788, + "grad_norm": 2.0991828441619873, + "learning_rate": 4.5631418449863505e-06, + "loss": 0.7759, + "step": 1867 + }, + { + "epoch": 0.6844998167827043, + "grad_norm": 2.1412177085876465, + "learning_rate": 4.5534481477766536e-06, + "loss": 0.7955, + "step": 1868 + }, + { + "epoch": 0.6848662513741297, + "grad_norm": 2.0972344875335693, + "learning_rate": 4.543761721713953e-06, + "loss": 0.8447, + "step": 1869 + }, + { + "epoch": 0.6852326859655552, + "grad_norm": 2.348965644836426, + "learning_rate": 4.534082579729623e-06, + "loss": 0.8297, + "step": 1870 + }, + { + "epoch": 0.6855991205569806, + "grad_norm": 2.3317277431488037, + "learning_rate": 4.524410734745322e-06, + "loss": 0.791, + "step": 1871 + }, + { + "epoch": 0.685965555148406, + "grad_norm": 2.5072484016418457, + "learning_rate": 4.514746199672961e-06, + "loss": 0.8313, + "step": 1872 + }, + { + "epoch": 0.6863319897398314, + "grad_norm": 2.189089059829712, + "learning_rate": 4.5050889874146966e-06, + "loss": 0.8015, + "step": 1873 + }, + { + "epoch": 0.6866984243312568, + "grad_norm": 2.1472368240356445, + "learning_rate": 4.495439110862912e-06, + "loss": 0.809, + "step": 1874 + }, + { + "epoch": 0.6870648589226823, + "grad_norm": 2.632460832595825, + "learning_rate": 4.4857965829001885e-06, + "loss": 0.8205, + "step": 1875 + }, + { + "epoch": 0.6874312935141077, + "grad_norm": 2.7049458026885986, + "learning_rate": 4.4761614163992975e-06, + "loss": 0.7543, + "step": 1876 + }, + { + "epoch": 0.6877977281055332, + "grad_norm": 2.1904704570770264, + "learning_rate": 4.46653362422319e-06, + "loss": 0.8536, + "step": 1877 + }, + { + "epoch": 0.6881641626969586, + "grad_norm": 2.2160379886627197, + "learning_rate": 4.4569132192249705e-06, + "loss": 0.7863, + "step": 1878 + }, + { + "epoch": 0.6885305972883841, + "grad_norm": 2.341730833053589, + "learning_rate": 4.447300214247884e-06, + "loss": 0.8412, + "step": 1879 + }, + { + "epoch": 0.6888970318798094, + "grad_norm": 2.155663013458252, + "learning_rate": 4.437694622125282e-06, + "loss": 0.8579, + "step": 1880 + }, + { + "epoch": 0.6892634664712349, + "grad_norm": 2.2758772373199463, + "learning_rate": 4.428096455680638e-06, + "loss": 0.8193, + "step": 1881 + }, + { + "epoch": 0.6896299010626603, + "grad_norm": 2.211446762084961, + "learning_rate": 4.4185057277275025e-06, + "loss": 0.7861, + "step": 1882 + }, + { + "epoch": 0.6899963356540857, + "grad_norm": 2.061429977416992, + "learning_rate": 4.408922451069499e-06, + "loss": 0.8447, + "step": 1883 + }, + { + "epoch": 0.6903627702455112, + "grad_norm": 2.347288131713867, + "learning_rate": 4.399346638500305e-06, + "loss": 0.7768, + "step": 1884 + }, + { + "epoch": 0.6907292048369366, + "grad_norm": 2.0781826972961426, + "learning_rate": 4.3897783028036235e-06, + "loss": 0.8566, + "step": 1885 + }, + { + "epoch": 0.691095639428362, + "grad_norm": 2.022632360458374, + "learning_rate": 4.380217456753191e-06, + "loss": 0.8002, + "step": 1886 + }, + { + "epoch": 0.6914620740197874, + "grad_norm": 2.080552101135254, + "learning_rate": 4.370664113112729e-06, + "loss": 0.87, + "step": 1887 + }, + { + "epoch": 0.6918285086112129, + "grad_norm": 2.203587770462036, + "learning_rate": 4.361118284635958e-06, + "loss": 0.8213, + "step": 1888 + }, + { + "epoch": 0.6921949432026383, + "grad_norm": 2.3901920318603516, + "learning_rate": 4.351579984066556e-06, + "loss": 0.7572, + "step": 1889 + }, + { + "epoch": 0.6925613777940638, + "grad_norm": 2.557530641555786, + "learning_rate": 4.342049224138162e-06, + "loss": 0.8289, + "step": 1890 + }, + { + "epoch": 0.6929278123854892, + "grad_norm": 2.30087947845459, + "learning_rate": 4.33252601757433e-06, + "loss": 0.8442, + "step": 1891 + }, + { + "epoch": 0.6932942469769147, + "grad_norm": 2.3882224559783936, + "learning_rate": 4.323010377088548e-06, + "loss": 0.7472, + "step": 1892 + }, + { + "epoch": 0.69366068156834, + "grad_norm": 2.2196640968322754, + "learning_rate": 4.313502315384195e-06, + "loss": 0.826, + "step": 1893 + }, + { + "epoch": 0.6940271161597655, + "grad_norm": 2.4228856563568115, + "learning_rate": 4.304001845154534e-06, + "loss": 0.8173, + "step": 1894 + }, + { + "epoch": 0.6943935507511909, + "grad_norm": 2.492797374725342, + "learning_rate": 4.2945089790826964e-06, + "loss": 0.8491, + "step": 1895 + }, + { + "epoch": 0.6947599853426163, + "grad_norm": 2.392589807510376, + "learning_rate": 4.285023729841653e-06, + "loss": 0.8966, + "step": 1896 + }, + { + "epoch": 0.6951264199340418, + "grad_norm": 2.366020679473877, + "learning_rate": 4.275546110094214e-06, + "loss": 0.8433, + "step": 1897 + }, + { + "epoch": 0.6954928545254672, + "grad_norm": 2.0518600940704346, + "learning_rate": 4.266076132493008e-06, + "loss": 0.7828, + "step": 1898 + }, + { + "epoch": 0.6958592891168927, + "grad_norm": 2.632521867752075, + "learning_rate": 4.256613809680446e-06, + "loss": 0.8256, + "step": 1899 + }, + { + "epoch": 0.696225723708318, + "grad_norm": 2.2324187755584717, + "learning_rate": 4.247159154288737e-06, + "loss": 0.8586, + "step": 1900 + }, + { + "epoch": 0.6965921582997435, + "grad_norm": 2.3237431049346924, + "learning_rate": 4.2377121789398415e-06, + "loss": 0.7983, + "step": 1901 + }, + { + "epoch": 0.6969585928911689, + "grad_norm": 2.55372953414917, + "learning_rate": 4.2282728962454724e-06, + "loss": 0.8691, + "step": 1902 + }, + { + "epoch": 0.6973250274825944, + "grad_norm": 2.5332720279693604, + "learning_rate": 4.218841318807075e-06, + "loss": 0.7848, + "step": 1903 + }, + { + "epoch": 0.6976914620740198, + "grad_norm": 2.3999102115631104, + "learning_rate": 4.209417459215804e-06, + "loss": 0.7941, + "step": 1904 + }, + { + "epoch": 0.6980578966654453, + "grad_norm": 2.0995166301727295, + "learning_rate": 4.2000013300525186e-06, + "loss": 0.7986, + "step": 1905 + }, + { + "epoch": 0.6984243312568706, + "grad_norm": 1.9330726861953735, + "learning_rate": 4.190592943887741e-06, + "loss": 0.8, + "step": 1906 + }, + { + "epoch": 0.698790765848296, + "grad_norm": 2.1339480876922607, + "learning_rate": 4.181192313281674e-06, + "loss": 0.8436, + "step": 1907 + }, + { + "epoch": 0.6991572004397215, + "grad_norm": 2.6887545585632324, + "learning_rate": 4.171799450784158e-06, + "loss": 0.777, + "step": 1908 + }, + { + "epoch": 0.6995236350311469, + "grad_norm": 2.4545319080352783, + "learning_rate": 4.1624143689346684e-06, + "loss": 0.7478, + "step": 1909 + }, + { + "epoch": 0.6998900696225724, + "grad_norm": 2.2089483737945557, + "learning_rate": 4.153037080262288e-06, + "loss": 0.8183, + "step": 1910 + }, + { + "epoch": 0.7002565042139978, + "grad_norm": 2.125802755355835, + "learning_rate": 4.143667597285692e-06, + "loss": 0.8096, + "step": 1911 + }, + { + "epoch": 0.7006229388054233, + "grad_norm": 2.2986371517181396, + "learning_rate": 4.1343059325131456e-06, + "loss": 0.831, + "step": 1912 + }, + { + "epoch": 0.7009893733968486, + "grad_norm": 2.3743975162506104, + "learning_rate": 4.1249520984424705e-06, + "loss": 0.8106, + "step": 1913 + }, + { + "epoch": 0.7013558079882741, + "grad_norm": 2.1469645500183105, + "learning_rate": 4.115606107561038e-06, + "loss": 0.8783, + "step": 1914 + }, + { + "epoch": 0.7017222425796995, + "grad_norm": 2.0734124183654785, + "learning_rate": 4.106267972345748e-06, + "loss": 0.8672, + "step": 1915 + }, + { + "epoch": 0.702088677171125, + "grad_norm": 2.2415788173675537, + "learning_rate": 4.096937705263004e-06, + "loss": 0.8532, + "step": 1916 + }, + { + "epoch": 0.7024551117625504, + "grad_norm": 2.446101665496826, + "learning_rate": 4.087615318768718e-06, + "loss": 0.7584, + "step": 1917 + }, + { + "epoch": 0.7028215463539759, + "grad_norm": 2.955641269683838, + "learning_rate": 4.0783008253082754e-06, + "loss": 0.7472, + "step": 1918 + }, + { + "epoch": 0.7031879809454012, + "grad_norm": 1.996250033378601, + "learning_rate": 4.068994237316527e-06, + "loss": 0.7776, + "step": 1919 + }, + { + "epoch": 0.7035544155368266, + "grad_norm": 2.1396939754486084, + "learning_rate": 4.05969556721777e-06, + "loss": 0.8349, + "step": 1920 + }, + { + "epoch": 0.7039208501282521, + "grad_norm": 2.2065043449401855, + "learning_rate": 4.050404827425726e-06, + "loss": 0.8736, + "step": 1921 + }, + { + "epoch": 0.7042872847196775, + "grad_norm": 2.3151960372924805, + "learning_rate": 4.041122030343531e-06, + "loss": 0.7911, + "step": 1922 + }, + { + "epoch": 0.704653719311103, + "grad_norm": 2.4541115760803223, + "learning_rate": 4.031847188363721e-06, + "loss": 0.8091, + "step": 1923 + }, + { + "epoch": 0.7050201539025284, + "grad_norm": 2.3490419387817383, + "learning_rate": 4.022580313868213e-06, + "loss": 0.8117, + "step": 1924 + }, + { + "epoch": 0.7053865884939539, + "grad_norm": 2.6550164222717285, + "learning_rate": 4.013321419228282e-06, + "loss": 0.8709, + "step": 1925 + }, + { + "epoch": 0.7057530230853792, + "grad_norm": 2.1409599781036377, + "learning_rate": 4.004070516804559e-06, + "loss": 0.8204, + "step": 1926 + }, + { + "epoch": 0.7061194576768047, + "grad_norm": 2.460340738296509, + "learning_rate": 3.994827618946992e-06, + "loss": 0.7722, + "step": 1927 + }, + { + "epoch": 0.7064858922682301, + "grad_norm": 2.2745752334594727, + "learning_rate": 3.985592737994851e-06, + "loss": 0.8428, + "step": 1928 + }, + { + "epoch": 0.7068523268596556, + "grad_norm": 2.2874555587768555, + "learning_rate": 3.976365886276705e-06, + "loss": 0.8038, + "step": 1929 + }, + { + "epoch": 0.707218761451081, + "grad_norm": 3.0215723514556885, + "learning_rate": 3.9671470761104e-06, + "loss": 0.8059, + "step": 1930 + }, + { + "epoch": 0.7075851960425064, + "grad_norm": 2.4965901374816895, + "learning_rate": 3.957936319803053e-06, + "loss": 0.805, + "step": 1931 + }, + { + "epoch": 0.7079516306339318, + "grad_norm": 2.127297878265381, + "learning_rate": 3.948733629651017e-06, + "loss": 0.8134, + "step": 1932 + }, + { + "epoch": 0.7083180652253572, + "grad_norm": 2.601527214050293, + "learning_rate": 3.9395390179398895e-06, + "loss": 0.7632, + "step": 1933 + }, + { + "epoch": 0.7086844998167827, + "grad_norm": 2.1665985584259033, + "learning_rate": 3.930352496944471e-06, + "loss": 0.8787, + "step": 1934 + }, + { + "epoch": 0.7090509344082081, + "grad_norm": 2.3517444133758545, + "learning_rate": 3.921174078928771e-06, + "loss": 0.8207, + "step": 1935 + }, + { + "epoch": 0.7094173689996336, + "grad_norm": 2.1779472827911377, + "learning_rate": 3.912003776145983e-06, + "loss": 0.8215, + "step": 1936 + }, + { + "epoch": 0.709783803591059, + "grad_norm": 2.2063920497894287, + "learning_rate": 3.902841600838453e-06, + "loss": 0.8076, + "step": 1937 + }, + { + "epoch": 0.7101502381824845, + "grad_norm": 2.0973567962646484, + "learning_rate": 3.8936875652376895e-06, + "loss": 0.8224, + "step": 1938 + }, + { + "epoch": 0.7105166727739098, + "grad_norm": 2.57674503326416, + "learning_rate": 3.88454168156433e-06, + "loss": 0.8262, + "step": 1939 + }, + { + "epoch": 0.7108831073653353, + "grad_norm": 2.488333225250244, + "learning_rate": 3.87540396202813e-06, + "loss": 0.8515, + "step": 1940 + }, + { + "epoch": 0.7112495419567607, + "grad_norm": 2.384260892868042, + "learning_rate": 3.86627441882795e-06, + "loss": 0.7949, + "step": 1941 + }, + { + "epoch": 0.7116159765481862, + "grad_norm": 2.0849318504333496, + "learning_rate": 3.8571530641517215e-06, + "loss": 0.8088, + "step": 1942 + }, + { + "epoch": 0.7119824111396116, + "grad_norm": 2.252814769744873, + "learning_rate": 3.8480399101764585e-06, + "loss": 0.8484, + "step": 1943 + }, + { + "epoch": 0.712348845731037, + "grad_norm": 2.517176389694214, + "learning_rate": 3.838934969068227e-06, + "loss": 0.83, + "step": 1944 + }, + { + "epoch": 0.7127152803224625, + "grad_norm": 2.161952495574951, + "learning_rate": 3.829838252982113e-06, + "loss": 0.7282, + "step": 1945 + }, + { + "epoch": 0.7130817149138878, + "grad_norm": 2.1794629096984863, + "learning_rate": 3.8207497740622426e-06, + "loss": 0.7889, + "step": 1946 + }, + { + "epoch": 0.7134481495053133, + "grad_norm": 2.5052850246429443, + "learning_rate": 3.8116695444417284e-06, + "loss": 0.8533, + "step": 1947 + }, + { + "epoch": 0.7138145840967387, + "grad_norm": 2.8699727058410645, + "learning_rate": 3.8025975762426804e-06, + "loss": 0.7708, + "step": 1948 + }, + { + "epoch": 0.7141810186881642, + "grad_norm": 2.2950704097747803, + "learning_rate": 3.7935338815761757e-06, + "loss": 0.7936, + "step": 1949 + }, + { + "epoch": 0.7145474532795896, + "grad_norm": 2.1499178409576416, + "learning_rate": 3.784478472542249e-06, + "loss": 0.8088, + "step": 1950 + }, + { + "epoch": 0.7149138878710151, + "grad_norm": 2.498230457305908, + "learning_rate": 3.775431361229871e-06, + "loss": 0.8589, + "step": 1951 + }, + { + "epoch": 0.7152803224624404, + "grad_norm": 2.3784115314483643, + "learning_rate": 3.766392559716939e-06, + "loss": 0.7791, + "step": 1952 + }, + { + "epoch": 0.7156467570538659, + "grad_norm": 2.2865967750549316, + "learning_rate": 3.757362080070247e-06, + "loss": 0.8398, + "step": 1953 + }, + { + "epoch": 0.7160131916452913, + "grad_norm": 2.2455546855926514, + "learning_rate": 3.7483399343454886e-06, + "loss": 0.7977, + "step": 1954 + }, + { + "epoch": 0.7163796262367167, + "grad_norm": 2.426880121231079, + "learning_rate": 3.7393261345872323e-06, + "loss": 0.8292, + "step": 1955 + }, + { + "epoch": 0.7167460608281422, + "grad_norm": 2.6572940349578857, + "learning_rate": 3.7303206928288948e-06, + "loss": 0.7638, + "step": 1956 + }, + { + "epoch": 0.7171124954195676, + "grad_norm": 2.630995512008667, + "learning_rate": 3.7213236210927483e-06, + "loss": 0.7029, + "step": 1957 + }, + { + "epoch": 0.717478930010993, + "grad_norm": 2.7464239597320557, + "learning_rate": 3.712334931389877e-06, + "loss": 0.7451, + "step": 1958 + }, + { + "epoch": 0.7178453646024184, + "grad_norm": 2.4412245750427246, + "learning_rate": 3.7033546357201845e-06, + "loss": 0.8073, + "step": 1959 + }, + { + "epoch": 0.7182117991938439, + "grad_norm": 2.5375096797943115, + "learning_rate": 3.6943827460723667e-06, + "loss": 0.8037, + "step": 1960 + }, + { + "epoch": 0.7185782337852693, + "grad_norm": 2.4279673099517822, + "learning_rate": 3.685419274423898e-06, + "loss": 0.8134, + "step": 1961 + }, + { + "epoch": 0.7189446683766948, + "grad_norm": 2.2107279300689697, + "learning_rate": 3.676464232741015e-06, + "loss": 0.8346, + "step": 1962 + }, + { + "epoch": 0.7193111029681202, + "grad_norm": 2.2736778259277344, + "learning_rate": 3.6675176329786933e-06, + "loss": 0.8344, + "step": 1963 + }, + { + "epoch": 0.7196775375595457, + "grad_norm": 2.9939663410186768, + "learning_rate": 3.658579487080646e-06, + "loss": 0.7851, + "step": 1964 + }, + { + "epoch": 0.720043972150971, + "grad_norm": 2.447913408279419, + "learning_rate": 3.6496498069792995e-06, + "loss": 0.7574, + "step": 1965 + }, + { + "epoch": 0.7204104067423965, + "grad_norm": 2.029849052429199, + "learning_rate": 3.6407286045957802e-06, + "loss": 0.8999, + "step": 1966 + }, + { + "epoch": 0.7207768413338219, + "grad_norm": 2.375657081604004, + "learning_rate": 3.6318158918398872e-06, + "loss": 0.8329, + "step": 1967 + }, + { + "epoch": 0.7211432759252473, + "grad_norm": 2.311725378036499, + "learning_rate": 3.6229116806100993e-06, + "loss": 0.8484, + "step": 1968 + }, + { + "epoch": 0.7215097105166728, + "grad_norm": 2.1953041553497314, + "learning_rate": 3.6140159827935316e-06, + "loss": 0.8258, + "step": 1969 + }, + { + "epoch": 0.7218761451080982, + "grad_norm": 2.110595464706421, + "learning_rate": 3.6051288102659456e-06, + "loss": 0.8319, + "step": 1970 + }, + { + "epoch": 0.7222425796995237, + "grad_norm": 2.6473276615142822, + "learning_rate": 3.5962501748917157e-06, + "loss": 0.7727, + "step": 1971 + }, + { + "epoch": 0.722609014290949, + "grad_norm": 2.762803316116333, + "learning_rate": 3.587380088523824e-06, + "loss": 0.7723, + "step": 1972 + }, + { + "epoch": 0.7229754488823745, + "grad_norm": 2.2459194660186768, + "learning_rate": 3.57851856300383e-06, + "loss": 0.8856, + "step": 1973 + }, + { + "epoch": 0.7233418834737999, + "grad_norm": 2.0015933513641357, + "learning_rate": 3.5696656101618755e-06, + "loss": 0.8034, + "step": 1974 + }, + { + "epoch": 0.7237083180652254, + "grad_norm": 2.0197722911834717, + "learning_rate": 3.56082124181665e-06, + "loss": 0.8149, + "step": 1975 + }, + { + "epoch": 0.7240747526566508, + "grad_norm": 2.4105708599090576, + "learning_rate": 3.5519854697753885e-06, + "loss": 0.856, + "step": 1976 + }, + { + "epoch": 0.7244411872480763, + "grad_norm": 2.091339111328125, + "learning_rate": 3.543158305833848e-06, + "loss": 0.7757, + "step": 1977 + }, + { + "epoch": 0.7248076218395016, + "grad_norm": 2.361039638519287, + "learning_rate": 3.5343397617762887e-06, + "loss": 0.7726, + "step": 1978 + }, + { + "epoch": 0.725174056430927, + "grad_norm": 2.58827805519104, + "learning_rate": 3.525529849375472e-06, + "loss": 0.8109, + "step": 1979 + }, + { + "epoch": 0.7255404910223525, + "grad_norm": 2.7834389209747314, + "learning_rate": 3.5167285803926252e-06, + "loss": 0.7179, + "step": 1980 + }, + { + "epoch": 0.7259069256137779, + "grad_norm": 2.9694321155548096, + "learning_rate": 3.507935966577447e-06, + "loss": 0.7565, + "step": 1981 + }, + { + "epoch": 0.7262733602052034, + "grad_norm": 2.681018114089966, + "learning_rate": 3.499152019668076e-06, + "loss": 0.7807, + "step": 1982 + }, + { + "epoch": 0.7266397947966288, + "grad_norm": 2.498192071914673, + "learning_rate": 3.490376751391086e-06, + "loss": 0.7333, + "step": 1983 + }, + { + "epoch": 0.7270062293880543, + "grad_norm": 2.3323285579681396, + "learning_rate": 3.4816101734614536e-06, + "loss": 0.7846, + "step": 1984 + }, + { + "epoch": 0.7273726639794796, + "grad_norm": 2.4998133182525635, + "learning_rate": 3.472852297582565e-06, + "loss": 0.8177, + "step": 1985 + }, + { + "epoch": 0.7277390985709051, + "grad_norm": 2.852997303009033, + "learning_rate": 3.464103135446183e-06, + "loss": 0.7623, + "step": 1986 + }, + { + "epoch": 0.7281055331623305, + "grad_norm": 2.3916666507720947, + "learning_rate": 3.455362698732441e-06, + "loss": 0.7801, + "step": 1987 + }, + { + "epoch": 0.728471967753756, + "grad_norm": 2.727652072906494, + "learning_rate": 3.4466309991098255e-06, + "loss": 0.8518, + "step": 1988 + }, + { + "epoch": 0.7288384023451814, + "grad_norm": 2.4952447414398193, + "learning_rate": 3.437908048235149e-06, + "loss": 0.7323, + "step": 1989 + }, + { + "epoch": 0.7292048369366069, + "grad_norm": 2.348653554916382, + "learning_rate": 3.429193857753559e-06, + "loss": 0.8303, + "step": 1990 + }, + { + "epoch": 0.7295712715280323, + "grad_norm": 2.3010904788970947, + "learning_rate": 3.4204884392984917e-06, + "loss": 0.8744, + "step": 1991 + }, + { + "epoch": 0.7299377061194576, + "grad_norm": 2.9672434329986572, + "learning_rate": 3.4117918044916853e-06, + "loss": 0.6981, + "step": 1992 + }, + { + "epoch": 0.7303041407108831, + "grad_norm": 2.3051466941833496, + "learning_rate": 3.4031039649431518e-06, + "loss": 0.8161, + "step": 1993 + }, + { + "epoch": 0.7306705753023085, + "grad_norm": 2.262472629547119, + "learning_rate": 3.3944249322511502e-06, + "loss": 0.8048, + "step": 1994 + }, + { + "epoch": 0.731037009893734, + "grad_norm": 2.3546090126037598, + "learning_rate": 3.385754718002193e-06, + "loss": 0.8214, + "step": 1995 + }, + { + "epoch": 0.7314034444851594, + "grad_norm": 2.283719301223755, + "learning_rate": 3.3770933337710165e-06, + "loss": 0.8198, + "step": 1996 + }, + { + "epoch": 0.7317698790765849, + "grad_norm": 2.383819818496704, + "learning_rate": 3.3684407911205697e-06, + "loss": 0.8661, + "step": 1997 + }, + { + "epoch": 0.7321363136680102, + "grad_norm": 2.4723551273345947, + "learning_rate": 3.3597971016019994e-06, + "loss": 0.8011, + "step": 1998 + }, + { + "epoch": 0.7325027482594357, + "grad_norm": 2.1126275062561035, + "learning_rate": 3.351162276754626e-06, + "loss": 0.8016, + "step": 1999 + }, + { + "epoch": 0.7328691828508611, + "grad_norm": 2.984003782272339, + "learning_rate": 3.342536328105944e-06, + "loss": 0.8045, + "step": 2000 + }, + { + "epoch": 0.7332356174422866, + "grad_norm": 2.3070881366729736, + "learning_rate": 3.3339192671715992e-06, + "loss": 0.7834, + "step": 2001 + }, + { + "epoch": 0.733602052033712, + "grad_norm": 2.393101453781128, + "learning_rate": 3.325311105455361e-06, + "loss": 0.8523, + "step": 2002 + }, + { + "epoch": 0.7339684866251374, + "grad_norm": 2.4970571994781494, + "learning_rate": 3.316711854449133e-06, + "loss": 0.7961, + "step": 2003 + }, + { + "epoch": 0.7343349212165629, + "grad_norm": 2.070354461669922, + "learning_rate": 3.308121525632908e-06, + "loss": 0.8765, + "step": 2004 + }, + { + "epoch": 0.7347013558079882, + "grad_norm": 2.0780506134033203, + "learning_rate": 3.299540130474781e-06, + "loss": 0.8653, + "step": 2005 + }, + { + "epoch": 0.7350677903994137, + "grad_norm": 2.2514584064483643, + "learning_rate": 3.290967680430912e-06, + "loss": 0.7971, + "step": 2006 + }, + { + "epoch": 0.7354342249908391, + "grad_norm": 2.65456223487854, + "learning_rate": 3.282404186945525e-06, + "loss": 0.8195, + "step": 2007 + }, + { + "epoch": 0.7358006595822646, + "grad_norm": 2.9336471557617188, + "learning_rate": 3.273849661450884e-06, + "loss": 0.7254, + "step": 2008 + }, + { + "epoch": 0.73616709417369, + "grad_norm": 2.372671604156494, + "learning_rate": 3.2653041153672782e-06, + "loss": 0.8261, + "step": 2009 + }, + { + "epoch": 0.7365335287651155, + "grad_norm": 2.256636381149292, + "learning_rate": 3.2567675601030136e-06, + "loss": 0.8386, + "step": 2010 + }, + { + "epoch": 0.7368999633565408, + "grad_norm": 2.8722164630889893, + "learning_rate": 3.248240007054392e-06, + "loss": 0.7729, + "step": 2011 + }, + { + "epoch": 0.7372663979479663, + "grad_norm": 2.3636233806610107, + "learning_rate": 3.2397214676057e-06, + "loss": 0.835, + "step": 2012 + }, + { + "epoch": 0.7376328325393917, + "grad_norm": 2.246462106704712, + "learning_rate": 3.231211953129183e-06, + "loss": 0.8031, + "step": 2013 + }, + { + "epoch": 0.7379992671308171, + "grad_norm": 2.266885757446289, + "learning_rate": 3.2227114749850476e-06, + "loss": 0.856, + "step": 2014 + }, + { + "epoch": 0.7383657017222426, + "grad_norm": 2.540126085281372, + "learning_rate": 3.214220044521428e-06, + "loss": 0.8486, + "step": 2015 + }, + { + "epoch": 0.738732136313668, + "grad_norm": 2.3920586109161377, + "learning_rate": 3.205737673074387e-06, + "loss": 0.8452, + "step": 2016 + }, + { + "epoch": 0.7390985709050935, + "grad_norm": 2.3333163261413574, + "learning_rate": 3.1972643719678896e-06, + "loss": 0.8147, + "step": 2017 + }, + { + "epoch": 0.7394650054965188, + "grad_norm": 2.166067600250244, + "learning_rate": 3.1888001525137958e-06, + "loss": 0.801, + "step": 2018 + }, + { + "epoch": 0.7398314400879443, + "grad_norm": 2.6589584350585938, + "learning_rate": 3.1803450260118397e-06, + "loss": 0.7791, + "step": 2019 + }, + { + "epoch": 0.7401978746793697, + "grad_norm": 2.284156560897827, + "learning_rate": 3.171899003749611e-06, + "loss": 0.8275, + "step": 2020 + }, + { + "epoch": 0.7405643092707952, + "grad_norm": 2.214045286178589, + "learning_rate": 3.163462097002552e-06, + "loss": 0.7909, + "step": 2021 + }, + { + "epoch": 0.7409307438622206, + "grad_norm": 2.4925358295440674, + "learning_rate": 3.1550343170339336e-06, + "loss": 0.6837, + "step": 2022 + }, + { + "epoch": 0.7412971784536461, + "grad_norm": 2.5517702102661133, + "learning_rate": 3.1466156750948464e-06, + "loss": 0.7163, + "step": 2023 + }, + { + "epoch": 0.7416636130450714, + "grad_norm": 2.5317089557647705, + "learning_rate": 3.1382061824241704e-06, + "loss": 0.8028, + "step": 2024 + }, + { + "epoch": 0.7420300476364969, + "grad_norm": 2.2435755729675293, + "learning_rate": 3.1298058502485863e-06, + "loss": 0.8738, + "step": 2025 + }, + { + "epoch": 0.7423964822279223, + "grad_norm": 2.4333622455596924, + "learning_rate": 3.1214146897825302e-06, + "loss": 0.882, + "step": 2026 + }, + { + "epoch": 0.7427629168193477, + "grad_norm": 2.3265597820281982, + "learning_rate": 3.1130327122282065e-06, + "loss": 0.82, + "step": 2027 + }, + { + "epoch": 0.7431293514107732, + "grad_norm": 1.9941530227661133, + "learning_rate": 3.1046599287755552e-06, + "loss": 0.4572, + "step": 2028 + }, + { + "epoch": 0.7434957860021986, + "grad_norm": 2.354604482650757, + "learning_rate": 3.0962963506022447e-06, + "loss": 0.7864, + "step": 2029 + }, + { + "epoch": 0.7438622205936241, + "grad_norm": 2.49794864654541, + "learning_rate": 3.0879419888736463e-06, + "loss": 0.7339, + "step": 2030 + }, + { + "epoch": 0.7442286551850494, + "grad_norm": 2.2387988567352295, + "learning_rate": 3.0795968547428377e-06, + "loss": 0.7992, + "step": 2031 + }, + { + "epoch": 0.7445950897764749, + "grad_norm": 2.4889345169067383, + "learning_rate": 3.0712609593505726e-06, + "loss": 0.8163, + "step": 2032 + }, + { + "epoch": 0.7449615243679003, + "grad_norm": 2.3034846782684326, + "learning_rate": 3.0629343138252708e-06, + "loss": 0.8405, + "step": 2033 + }, + { + "epoch": 0.7453279589593258, + "grad_norm": 2.2705564498901367, + "learning_rate": 3.0546169292830096e-06, + "loss": 0.8507, + "step": 2034 + }, + { + "epoch": 0.7456943935507512, + "grad_norm": 2.5430757999420166, + "learning_rate": 3.04630881682749e-06, + "loss": 0.8611, + "step": 2035 + }, + { + "epoch": 0.7460608281421767, + "grad_norm": 2.391397714614868, + "learning_rate": 3.0380099875500504e-06, + "loss": 0.8666, + "step": 2036 + }, + { + "epoch": 0.746427262733602, + "grad_norm": 3.0161325931549072, + "learning_rate": 3.0297204525296197e-06, + "loss": 0.7997, + "step": 2037 + }, + { + "epoch": 0.7467936973250274, + "grad_norm": 2.164350748062134, + "learning_rate": 3.021440222832732e-06, + "loss": 0.8054, + "step": 2038 + }, + { + "epoch": 0.7471601319164529, + "grad_norm": 2.2333128452301025, + "learning_rate": 3.0131693095134982e-06, + "loss": 0.8469, + "step": 2039 + }, + { + "epoch": 0.7475265665078783, + "grad_norm": 2.2303738594055176, + "learning_rate": 3.0049077236135805e-06, + "loss": 0.803, + "step": 2040 + }, + { + "epoch": 0.7478930010993038, + "grad_norm": 2.561152935028076, + "learning_rate": 2.9966554761622003e-06, + "loss": 0.7331, + "step": 2041 + }, + { + "epoch": 0.7482594356907292, + "grad_norm": 2.60172700881958, + "learning_rate": 2.9884125781761076e-06, + "loss": 0.769, + "step": 2042 + }, + { + "epoch": 0.7486258702821547, + "grad_norm": 2.1624298095703125, + "learning_rate": 2.980179040659572e-06, + "loss": 0.798, + "step": 2043 + }, + { + "epoch": 0.74899230487358, + "grad_norm": 2.607041835784912, + "learning_rate": 2.971954874604367e-06, + "loss": 0.8, + "step": 2044 + }, + { + "epoch": 0.7493587394650055, + "grad_norm": 2.153707504272461, + "learning_rate": 2.963740090989757e-06, + "loss": 0.8808, + "step": 2045 + }, + { + "epoch": 0.7497251740564309, + "grad_norm": 3.1826159954071045, + "learning_rate": 2.9555347007824742e-06, + "loss": 0.8187, + "step": 2046 + }, + { + "epoch": 0.7500916086478564, + "grad_norm": 2.341423273086548, + "learning_rate": 2.9473387149367182e-06, + "loss": 0.7878, + "step": 2047 + }, + { + "epoch": 0.7504580432392818, + "grad_norm": 2.590043067932129, + "learning_rate": 2.9391521443941263e-06, + "loss": 0.8031, + "step": 2048 + }, + { + "epoch": 0.7508244778307073, + "grad_norm": 2.0318918228149414, + "learning_rate": 2.930975000083773e-06, + "loss": 0.7952, + "step": 2049 + }, + { + "epoch": 0.7508244778307073, + "eval_loss": 0.8080984354019165, + "eval_runtime": 799.0988, + "eval_samples_per_second": 3.394, + "eval_steps_per_second": 0.424, + "step": 2049 + }, + { + "epoch": 0.7511909124221327, + "grad_norm": 2.6090524196624756, + "learning_rate": 2.9228072929221495e-06, + "loss": 0.7518, + "step": 2050 + }, + { + "epoch": 0.751557347013558, + "grad_norm": 2.003688335418701, + "learning_rate": 2.9146490338131405e-06, + "loss": 0.8436, + "step": 2051 + }, + { + "epoch": 0.7519237816049835, + "grad_norm": 2.512913465499878, + "learning_rate": 2.906500233648023e-06, + "loss": 0.8456, + "step": 2052 + }, + { + "epoch": 0.7522902161964089, + "grad_norm": 2.6392178535461426, + "learning_rate": 2.898360903305447e-06, + "loss": 0.7732, + "step": 2053 + }, + { + "epoch": 0.7526566507878344, + "grad_norm": 2.2465503215789795, + "learning_rate": 2.890231053651419e-06, + "loss": 0.8681, + "step": 2054 + }, + { + "epoch": 0.7530230853792598, + "grad_norm": 2.790282964706421, + "learning_rate": 2.882110695539291e-06, + "loss": 0.8697, + "step": 2055 + }, + { + "epoch": 0.7533895199706853, + "grad_norm": 2.172825336456299, + "learning_rate": 2.873999839809736e-06, + "loss": 0.7808, + "step": 2056 + }, + { + "epoch": 0.7537559545621106, + "grad_norm": 2.74893856048584, + "learning_rate": 2.865898497290749e-06, + "loss": 0.7675, + "step": 2057 + }, + { + "epoch": 0.7541223891535361, + "grad_norm": 2.2950291633605957, + "learning_rate": 2.8578066787976266e-06, + "loss": 0.8353, + "step": 2058 + }, + { + "epoch": 0.7544888237449615, + "grad_norm": 2.3367483615875244, + "learning_rate": 2.849724395132941e-06, + "loss": 0.8456, + "step": 2059 + }, + { + "epoch": 0.754855258336387, + "grad_norm": 2.3577630519866943, + "learning_rate": 2.841651657086547e-06, + "loss": 0.8345, + "step": 2060 + }, + { + "epoch": 0.7552216929278124, + "grad_norm": 2.4616196155548096, + "learning_rate": 2.8335884754355436e-06, + "loss": 0.752, + "step": 2061 + }, + { + "epoch": 0.7555881275192378, + "grad_norm": 2.157444953918457, + "learning_rate": 2.8255348609442835e-06, + "loss": 0.8047, + "step": 2062 + }, + { + "epoch": 0.7559545621106633, + "grad_norm": 2.123469352722168, + "learning_rate": 2.8174908243643405e-06, + "loss": 0.8698, + "step": 2063 + }, + { + "epoch": 0.7563209967020886, + "grad_norm": 2.120635986328125, + "learning_rate": 2.809456376434504e-06, + "loss": 0.815, + "step": 2064 + }, + { + "epoch": 0.7566874312935141, + "grad_norm": 2.5298824310302734, + "learning_rate": 2.801431527880767e-06, + "loss": 0.8078, + "step": 2065 + }, + { + "epoch": 0.7570538658849395, + "grad_norm": 2.6083366870880127, + "learning_rate": 2.7934162894162954e-06, + "loss": 0.7339, + "step": 2066 + }, + { + "epoch": 0.757420300476365, + "grad_norm": 2.328028678894043, + "learning_rate": 2.7854106717414354e-06, + "loss": 0.8201, + "step": 2067 + }, + { + "epoch": 0.7577867350677904, + "grad_norm": 2.296004056930542, + "learning_rate": 2.777414685543689e-06, + "loss": 0.8521, + "step": 2068 + }, + { + "epoch": 0.7581531696592159, + "grad_norm": 2.2588114738464355, + "learning_rate": 2.7694283414976995e-06, + "loss": 0.8129, + "step": 2069 + }, + { + "epoch": 0.7585196042506412, + "grad_norm": 2.3270256519317627, + "learning_rate": 2.761451650265231e-06, + "loss": 0.8647, + "step": 2070 + }, + { + "epoch": 0.7588860388420667, + "grad_norm": 2.4873387813568115, + "learning_rate": 2.753484622495173e-06, + "loss": 0.785, + "step": 2071 + }, + { + "epoch": 0.7592524734334921, + "grad_norm": 2.892294406890869, + "learning_rate": 2.745527268823501e-06, + "loss": 0.7487, + "step": 2072 + }, + { + "epoch": 0.7596189080249176, + "grad_norm": 2.315394163131714, + "learning_rate": 2.737579599873288e-06, + "loss": 0.8463, + "step": 2073 + }, + { + "epoch": 0.759985342616343, + "grad_norm": 2.721442699432373, + "learning_rate": 2.72964162625467e-06, + "loss": 0.7053, + "step": 2074 + }, + { + "epoch": 0.7603517772077684, + "grad_norm": 2.559688091278076, + "learning_rate": 2.7217133585648427e-06, + "loss": 0.8889, + "step": 2075 + }, + { + "epoch": 0.7607182117991939, + "grad_norm": 2.5306828022003174, + "learning_rate": 2.713794807388047e-06, + "loss": 0.8966, + "step": 2076 + }, + { + "epoch": 0.7610846463906192, + "grad_norm": 2.15157151222229, + "learning_rate": 2.7058859832955432e-06, + "loss": 0.8052, + "step": 2077 + }, + { + "epoch": 0.7614510809820447, + "grad_norm": 2.3579909801483154, + "learning_rate": 2.6979868968456147e-06, + "loss": 0.8203, + "step": 2078 + }, + { + "epoch": 0.7618175155734701, + "grad_norm": 2.602248430252075, + "learning_rate": 2.690097558583543e-06, + "loss": 0.7435, + "step": 2079 + }, + { + "epoch": 0.7621839501648956, + "grad_norm": 2.3024191856384277, + "learning_rate": 2.6822179790415982e-06, + "loss": 0.7994, + "step": 2080 + }, + { + "epoch": 0.762550384756321, + "grad_norm": 2.410738229751587, + "learning_rate": 2.6743481687390115e-06, + "loss": 0.8174, + "step": 2081 + }, + { + "epoch": 0.7629168193477465, + "grad_norm": 2.2351765632629395, + "learning_rate": 2.6664881381819873e-06, + "loss": 0.8043, + "step": 2082 + }, + { + "epoch": 0.7632832539391718, + "grad_norm": 2.340183973312378, + "learning_rate": 2.658637897863662e-06, + "loss": 0.8373, + "step": 2083 + }, + { + "epoch": 0.7636496885305973, + "grad_norm": 2.4249885082244873, + "learning_rate": 2.650797458264107e-06, + "loss": 0.7883, + "step": 2084 + }, + { + "epoch": 0.7640161231220227, + "grad_norm": 2.5286998748779297, + "learning_rate": 2.642966829850312e-06, + "loss": 0.8601, + "step": 2085 + }, + { + "epoch": 0.7643825577134481, + "grad_norm": 2.892470359802246, + "learning_rate": 2.6351460230761683e-06, + "loss": 0.7755, + "step": 2086 + }, + { + "epoch": 0.7647489923048736, + "grad_norm": 2.4626762866973877, + "learning_rate": 2.627335048382448e-06, + "loss": 0.7893, + "step": 2087 + }, + { + "epoch": 0.765115426896299, + "grad_norm": 2.1419596672058105, + "learning_rate": 2.6195339161968046e-06, + "loss": 0.793, + "step": 2088 + }, + { + "epoch": 0.7654818614877245, + "grad_norm": 2.3152785301208496, + "learning_rate": 2.6117426369337505e-06, + "loss": 0.8636, + "step": 2089 + }, + { + "epoch": 0.7658482960791498, + "grad_norm": 2.5008513927459717, + "learning_rate": 2.603961220994644e-06, + "loss": 0.8222, + "step": 2090 + }, + { + "epoch": 0.7662147306705753, + "grad_norm": 2.367537021636963, + "learning_rate": 2.596189678767679e-06, + "loss": 0.7907, + "step": 2091 + }, + { + "epoch": 0.7665811652620007, + "grad_norm": 2.2729032039642334, + "learning_rate": 2.5884280206278577e-06, + "loss": 0.8255, + "step": 2092 + }, + { + "epoch": 0.7669475998534262, + "grad_norm": 2.4431867599487305, + "learning_rate": 2.5806762569370014e-06, + "loss": 0.8168, + "step": 2093 + }, + { + "epoch": 0.7673140344448516, + "grad_norm": 2.766500234603882, + "learning_rate": 2.5729343980437095e-06, + "loss": 0.687, + "step": 2094 + }, + { + "epoch": 0.7676804690362771, + "grad_norm": 2.3301477432250977, + "learning_rate": 2.565202454283364e-06, + "loss": 0.8608, + "step": 2095 + }, + { + "epoch": 0.7680469036277024, + "grad_norm": 2.6721205711364746, + "learning_rate": 2.5574804359781167e-06, + "loss": 0.8265, + "step": 2096 + }, + { + "epoch": 0.7684133382191279, + "grad_norm": 2.609799861907959, + "learning_rate": 2.5497683534368543e-06, + "loss": 0.834, + "step": 2097 + }, + { + "epoch": 0.7687797728105533, + "grad_norm": 2.4966788291931152, + "learning_rate": 2.5420662169552114e-06, + "loss": 0.7786, + "step": 2098 + }, + { + "epoch": 0.7691462074019787, + "grad_norm": 2.085581064224243, + "learning_rate": 2.53437403681554e-06, + "loss": 0.8449, + "step": 2099 + }, + { + "epoch": 0.7695126419934042, + "grad_norm": 2.4107308387756348, + "learning_rate": 2.526691823286902e-06, + "loss": 0.8249, + "step": 2100 + }, + { + "epoch": 0.7698790765848296, + "grad_norm": 2.4472155570983887, + "learning_rate": 2.5190195866250545e-06, + "loss": 0.7904, + "step": 2101 + }, + { + "epoch": 0.7702455111762551, + "grad_norm": 2.776448965072632, + "learning_rate": 2.5113573370724276e-06, + "loss": 0.7221, + "step": 2102 + }, + { + "epoch": 0.7706119457676804, + "grad_norm": 2.283174753189087, + "learning_rate": 2.503705084858129e-06, + "loss": 0.8141, + "step": 2103 + }, + { + "epoch": 0.7709783803591059, + "grad_norm": 2.382694959640503, + "learning_rate": 2.496062840197918e-06, + "loss": 0.804, + "step": 2104 + }, + { + "epoch": 0.7713448149505313, + "grad_norm": 2.3865418434143066, + "learning_rate": 2.488430613294187e-06, + "loss": 0.7391, + "step": 2105 + }, + { + "epoch": 0.7717112495419568, + "grad_norm": 2.399531602859497, + "learning_rate": 2.4808084143359602e-06, + "loss": 0.7877, + "step": 2106 + }, + { + "epoch": 0.7720776841333822, + "grad_norm": 2.91961932182312, + "learning_rate": 2.4731962534988784e-06, + "loss": 0.7119, + "step": 2107 + }, + { + "epoch": 0.7724441187248077, + "grad_norm": 2.2047502994537354, + "learning_rate": 2.465594140945169e-06, + "loss": 0.8487, + "step": 2108 + }, + { + "epoch": 0.772810553316233, + "grad_norm": 3.028589963912964, + "learning_rate": 2.4580020868236575e-06, + "loss": 0.799, + "step": 2109 + }, + { + "epoch": 0.7731769879076584, + "grad_norm": 2.747431993484497, + "learning_rate": 2.450420101269735e-06, + "loss": 0.7931, + "step": 2110 + }, + { + "epoch": 0.7735434224990839, + "grad_norm": 2.3259732723236084, + "learning_rate": 2.442848194405354e-06, + "loss": 0.7772, + "step": 2111 + }, + { + "epoch": 0.7739098570905093, + "grad_norm": 2.3536782264709473, + "learning_rate": 2.435286376339012e-06, + "loss": 0.7929, + "step": 2112 + }, + { + "epoch": 0.7742762916819348, + "grad_norm": 2.4395339488983154, + "learning_rate": 2.427734657165731e-06, + "loss": 0.8416, + "step": 2113 + }, + { + "epoch": 0.7746427262733602, + "grad_norm": 2.406297445297241, + "learning_rate": 2.420193046967061e-06, + "loss": 0.811, + "step": 2114 + }, + { + "epoch": 0.7750091608647857, + "grad_norm": 2.6960110664367676, + "learning_rate": 2.412661555811052e-06, + "loss": 0.8142, + "step": 2115 + }, + { + "epoch": 0.775375595456211, + "grad_norm": 2.3994743824005127, + "learning_rate": 2.4051401937522424e-06, + "loss": 0.759, + "step": 2116 + }, + { + "epoch": 0.7757420300476365, + "grad_norm": 2.3011417388916016, + "learning_rate": 2.3976289708316538e-06, + "loss": 0.7958, + "step": 2117 + }, + { + "epoch": 0.7761084646390619, + "grad_norm": 2.311936378479004, + "learning_rate": 2.390127897076765e-06, + "loss": 0.813, + "step": 2118 + }, + { + "epoch": 0.7764748992304874, + "grad_norm": 2.259918689727783, + "learning_rate": 2.382636982501513e-06, + "loss": 0.8316, + "step": 2119 + }, + { + "epoch": 0.7768413338219128, + "grad_norm": 2.676295042037964, + "learning_rate": 2.3751562371062684e-06, + "loss": 0.82, + "step": 2120 + }, + { + "epoch": 0.7772077684133383, + "grad_norm": 2.99137020111084, + "learning_rate": 2.3676856708778274e-06, + "loss": 0.7577, + "step": 2121 + }, + { + "epoch": 0.7775742030047637, + "grad_norm": 2.3418562412261963, + "learning_rate": 2.3602252937893987e-06, + "loss": 0.8698, + "step": 2122 + }, + { + "epoch": 0.777940637596189, + "grad_norm": 2.545377492904663, + "learning_rate": 2.3527751158005797e-06, + "loss": 0.7925, + "step": 2123 + }, + { + "epoch": 0.7783070721876145, + "grad_norm": 2.7940926551818848, + "learning_rate": 2.3453351468573617e-06, + "loss": 0.7702, + "step": 2124 + }, + { + "epoch": 0.7786735067790399, + "grad_norm": 2.804811477661133, + "learning_rate": 2.3379053968921038e-06, + "loss": 0.7383, + "step": 2125 + }, + { + "epoch": 0.7790399413704654, + "grad_norm": 2.2485673427581787, + "learning_rate": 2.330485875823526e-06, + "loss": 0.7616, + "step": 2126 + }, + { + "epoch": 0.7794063759618908, + "grad_norm": 2.019737482070923, + "learning_rate": 2.3230765935566823e-06, + "loss": 0.8021, + "step": 2127 + }, + { + "epoch": 0.7797728105533163, + "grad_norm": 1.9526029825210571, + "learning_rate": 2.315677559982973e-06, + "loss": 0.8219, + "step": 2128 + }, + { + "epoch": 0.7801392451447416, + "grad_norm": 2.3496994972229004, + "learning_rate": 2.3082887849800984e-06, + "loss": 0.7756, + "step": 2129 + }, + { + "epoch": 0.7805056797361671, + "grad_norm": 2.2115635871887207, + "learning_rate": 2.3009102784120808e-06, + "loss": 0.8192, + "step": 2130 + }, + { + "epoch": 0.7808721143275925, + "grad_norm": 2.2814576625823975, + "learning_rate": 2.2935420501292238e-06, + "loss": 0.8423, + "step": 2131 + }, + { + "epoch": 0.781238548919018, + "grad_norm": 2.196955442428589, + "learning_rate": 2.286184109968117e-06, + "loss": 0.8066, + "step": 2132 + }, + { + "epoch": 0.7816049835104434, + "grad_norm": 2.391277551651001, + "learning_rate": 2.278836467751604e-06, + "loss": 0.7948, + "step": 2133 + }, + { + "epoch": 0.7819714181018688, + "grad_norm": 3.0919301509857178, + "learning_rate": 2.2714991332887916e-06, + "loss": 0.72, + "step": 2134 + }, + { + "epoch": 0.7823378526932943, + "grad_norm": 2.3225972652435303, + "learning_rate": 2.264172116375021e-06, + "loss": 0.8308, + "step": 2135 + }, + { + "epoch": 0.7827042872847196, + "grad_norm": 2.4777140617370605, + "learning_rate": 2.2568554267918596e-06, + "loss": 0.8269, + "step": 2136 + }, + { + "epoch": 0.7830707218761451, + "grad_norm": 2.737854480743408, + "learning_rate": 2.2495490743070915e-06, + "loss": 0.8036, + "step": 2137 + }, + { + "epoch": 0.7834371564675705, + "grad_norm": 2.548006534576416, + "learning_rate": 2.2422530686746936e-06, + "loss": 0.827, + "step": 2138 + }, + { + "epoch": 0.783803591058996, + "grad_norm": 2.565333604812622, + "learning_rate": 2.234967419634837e-06, + "loss": 0.8095, + "step": 2139 + }, + { + "epoch": 0.7841700256504214, + "grad_norm": 2.1336276531219482, + "learning_rate": 2.2276921369138593e-06, + "loss": 0.8294, + "step": 2140 + }, + { + "epoch": 0.7845364602418469, + "grad_norm": 2.151606321334839, + "learning_rate": 2.2204272302242658e-06, + "loss": 0.8249, + "step": 2141 + }, + { + "epoch": 0.7849028948332722, + "grad_norm": 2.794053792953491, + "learning_rate": 2.2131727092647082e-06, + "loss": 0.7268, + "step": 2142 + }, + { + "epoch": 0.7852693294246977, + "grad_norm": 2.621978998184204, + "learning_rate": 2.205928583719973e-06, + "loss": 0.7889, + "step": 2143 + }, + { + "epoch": 0.7856357640161231, + "grad_norm": 2.2475435733795166, + "learning_rate": 2.198694863260964e-06, + "loss": 0.8576, + "step": 2144 + }, + { + "epoch": 0.7860021986075486, + "grad_norm": 2.1243855953216553, + "learning_rate": 2.191471557544701e-06, + "loss": 0.7949, + "step": 2145 + }, + { + "epoch": 0.786368633198974, + "grad_norm": 2.1951136589050293, + "learning_rate": 2.184258676214298e-06, + "loss": 0.7939, + "step": 2146 + }, + { + "epoch": 0.7867350677903994, + "grad_norm": 2.225924491882324, + "learning_rate": 2.1770562288989517e-06, + "loss": 0.8111, + "step": 2147 + }, + { + "epoch": 0.7871015023818249, + "grad_norm": 2.1376214027404785, + "learning_rate": 2.169864225213931e-06, + "loss": 0.784, + "step": 2148 + }, + { + "epoch": 0.7874679369732502, + "grad_norm": 3.0692696571350098, + "learning_rate": 2.1626826747605578e-06, + "loss": 0.7657, + "step": 2149 + }, + { + "epoch": 0.7878343715646757, + "grad_norm": 2.149909257888794, + "learning_rate": 2.1555115871262055e-06, + "loss": 0.8242, + "step": 2150 + }, + { + "epoch": 0.7882008061561011, + "grad_norm": 2.780365228652954, + "learning_rate": 2.1483509718842734e-06, + "loss": 0.7929, + "step": 2151 + }, + { + "epoch": 0.7885672407475266, + "grad_norm": 2.1888341903686523, + "learning_rate": 2.141200838594184e-06, + "loss": 0.7964, + "step": 2152 + }, + { + "epoch": 0.788933675338952, + "grad_norm": 2.308290719985962, + "learning_rate": 2.1340611968013693e-06, + "loss": 0.7618, + "step": 2153 + }, + { + "epoch": 0.7893001099303775, + "grad_norm": 2.4088211059570312, + "learning_rate": 2.1269320560372463e-06, + "loss": 0.831, + "step": 2154 + }, + { + "epoch": 0.7896665445218028, + "grad_norm": 2.241452217102051, + "learning_rate": 2.1198134258192193e-06, + "loss": 0.8213, + "step": 2155 + }, + { + "epoch": 0.7900329791132283, + "grad_norm": 2.262209177017212, + "learning_rate": 2.1127053156506628e-06, + "loss": 0.8348, + "step": 2156 + }, + { + "epoch": 0.7903994137046537, + "grad_norm": 2.7259719371795654, + "learning_rate": 2.1056077350209014e-06, + "loss": 0.7151, + "step": 2157 + }, + { + "epoch": 0.7907658482960791, + "grad_norm": 2.6229419708251953, + "learning_rate": 2.0985206934052094e-06, + "loss": 0.8081, + "step": 2158 + }, + { + "epoch": 0.7911322828875046, + "grad_norm": 2.408729314804077, + "learning_rate": 2.091444200264784e-06, + "loss": 0.7861, + "step": 2159 + }, + { + "epoch": 0.79149871747893, + "grad_norm": 2.3548901081085205, + "learning_rate": 2.0843782650467437e-06, + "loss": 0.874, + "step": 2160 + }, + { + "epoch": 0.7918651520703555, + "grad_norm": 2.125981092453003, + "learning_rate": 2.0773228971841165e-06, + "loss": 0.7882, + "step": 2161 + }, + { + "epoch": 0.7922315866617808, + "grad_norm": 2.5681943893432617, + "learning_rate": 2.0702781060958133e-06, + "loss": 0.764, + "step": 2162 + }, + { + "epoch": 0.7925980212532063, + "grad_norm": 3.0221588611602783, + "learning_rate": 2.0632439011866356e-06, + "loss": 0.7845, + "step": 2163 + }, + { + "epoch": 0.7929644558446317, + "grad_norm": 2.767496109008789, + "learning_rate": 2.056220291847243e-06, + "loss": 0.8026, + "step": 2164 + }, + { + "epoch": 0.7933308904360572, + "grad_norm": 2.1552770137786865, + "learning_rate": 2.049207287454156e-06, + "loss": 0.8342, + "step": 2165 + }, + { + "epoch": 0.7936973250274826, + "grad_norm": 2.2508256435394287, + "learning_rate": 2.0422048973697362e-06, + "loss": 0.8133, + "step": 2166 + }, + { + "epoch": 0.7940637596189081, + "grad_norm": 2.6234443187713623, + "learning_rate": 2.035213130942175e-06, + "loss": 0.8476, + "step": 2167 + }, + { + "epoch": 0.7944301942103335, + "grad_norm": 2.3684444427490234, + "learning_rate": 2.02823199750548e-06, + "loss": 0.8084, + "step": 2168 + }, + { + "epoch": 0.7947966288017588, + "grad_norm": 2.1743626594543457, + "learning_rate": 2.021261506379468e-06, + "loss": 0.8631, + "step": 2169 + }, + { + "epoch": 0.7951630633931843, + "grad_norm": 2.5163462162017822, + "learning_rate": 2.01430166686974e-06, + "loss": 0.7819, + "step": 2170 + }, + { + "epoch": 0.7955294979846097, + "grad_norm": 2.3898160457611084, + "learning_rate": 2.007352488267684e-06, + "loss": 0.7564, + "step": 2171 + }, + { + "epoch": 0.7958959325760352, + "grad_norm": 2.44692063331604, + "learning_rate": 2.000413979850456e-06, + "loss": 0.8454, + "step": 2172 + }, + { + "epoch": 0.7962623671674606, + "grad_norm": 2.379611015319824, + "learning_rate": 1.993486150880962e-06, + "loss": 0.8405, + "step": 2173 + }, + { + "epoch": 0.7966288017588861, + "grad_norm": 2.6301681995391846, + "learning_rate": 1.9865690106078573e-06, + "loss": 0.7633, + "step": 2174 + }, + { + "epoch": 0.7969952363503114, + "grad_norm": 3.5861284732818604, + "learning_rate": 1.9796625682655203e-06, + "loss": 0.7662, + "step": 2175 + }, + { + "epoch": 0.7973616709417369, + "grad_norm": 2.236845016479492, + "learning_rate": 1.9727668330740525e-06, + "loss": 0.8066, + "step": 2176 + }, + { + "epoch": 0.7977281055331623, + "grad_norm": 2.5206212997436523, + "learning_rate": 1.965881814239263e-06, + "loss": 0.8183, + "step": 2177 + }, + { + "epoch": 0.7980945401245878, + "grad_norm": 2.225001811981201, + "learning_rate": 1.959007520952653e-06, + "loss": 0.8193, + "step": 2178 + }, + { + "epoch": 0.7984609747160132, + "grad_norm": 2.5378408432006836, + "learning_rate": 1.952143962391403e-06, + "loss": 0.7865, + "step": 2179 + }, + { + "epoch": 0.7988274093074387, + "grad_norm": 2.249823570251465, + "learning_rate": 1.9452911477183623e-06, + "loss": 0.7653, + "step": 2180 + }, + { + "epoch": 0.799193843898864, + "grad_norm": 2.4021379947662354, + "learning_rate": 1.9384490860820414e-06, + "loss": 0.8034, + "step": 2181 + }, + { + "epoch": 0.7995602784902894, + "grad_norm": 2.3179380893707275, + "learning_rate": 1.9316177866165906e-06, + "loss": 0.7761, + "step": 2182 + }, + { + "epoch": 0.7999267130817149, + "grad_norm": 2.412019729614258, + "learning_rate": 1.9247972584417973e-06, + "loss": 0.8295, + "step": 2183 + }, + { + "epoch": 0.8002931476731403, + "grad_norm": 2.0360944271087646, + "learning_rate": 1.9179875106630687e-06, + "loss": 0.8288, + "step": 2184 + }, + { + "epoch": 0.8006595822645658, + "grad_norm": 2.4399008750915527, + "learning_rate": 1.9111885523714146e-06, + "loss": 0.8674, + "step": 2185 + }, + { + "epoch": 0.8010260168559912, + "grad_norm": 2.487194299697876, + "learning_rate": 1.904400392643444e-06, + "loss": 0.7448, + "step": 2186 + }, + { + "epoch": 0.8013924514474167, + "grad_norm": 2.4780404567718506, + "learning_rate": 1.897623040541352e-06, + "loss": 0.838, + "step": 2187 + }, + { + "epoch": 0.801758886038842, + "grad_norm": 2.225506544113159, + "learning_rate": 1.8908565051129047e-06, + "loss": 0.8501, + "step": 2188 + }, + { + "epoch": 0.8021253206302675, + "grad_norm": 2.3592822551727295, + "learning_rate": 1.8841007953914293e-06, + "loss": 0.7955, + "step": 2189 + }, + { + "epoch": 0.8024917552216929, + "grad_norm": 2.3249824047088623, + "learning_rate": 1.8773559203957937e-06, + "loss": 0.8009, + "step": 2190 + }, + { + "epoch": 0.8028581898131184, + "grad_norm": 2.304548740386963, + "learning_rate": 1.8706218891304084e-06, + "loss": 0.8716, + "step": 2191 + }, + { + "epoch": 0.8032246244045438, + "grad_norm": 2.510841131210327, + "learning_rate": 1.8638987105852068e-06, + "loss": 0.8073, + "step": 2192 + }, + { + "epoch": 0.8035910589959692, + "grad_norm": 2.4938032627105713, + "learning_rate": 1.8571863937356304e-06, + "loss": 0.8994, + "step": 2193 + }, + { + "epoch": 0.8039574935873947, + "grad_norm": 2.564622163772583, + "learning_rate": 1.8504849475426278e-06, + "loss": 0.7947, + "step": 2194 + }, + { + "epoch": 0.80432392817882, + "grad_norm": 3.071754217147827, + "learning_rate": 1.8437943809526226e-06, + "loss": 0.735, + "step": 2195 + }, + { + "epoch": 0.8046903627702455, + "grad_norm": 2.277132272720337, + "learning_rate": 1.8371147028975266e-06, + "loss": 0.8293, + "step": 2196 + }, + { + "epoch": 0.8050567973616709, + "grad_norm": 2.702019214630127, + "learning_rate": 1.830445922294708e-06, + "loss": 0.7472, + "step": 2197 + }, + { + "epoch": 0.8054232319530964, + "grad_norm": 2.243504047393799, + "learning_rate": 1.8237880480469884e-06, + "loss": 0.8351, + "step": 2198 + }, + { + "epoch": 0.8057896665445218, + "grad_norm": 2.38909649848938, + "learning_rate": 1.8171410890426322e-06, + "loss": 0.8351, + "step": 2199 + }, + { + "epoch": 0.8061561011359473, + "grad_norm": 2.293527126312256, + "learning_rate": 1.8105050541553327e-06, + "loss": 0.7708, + "step": 2200 + }, + { + "epoch": 0.8065225357273726, + "grad_norm": 2.17563796043396, + "learning_rate": 1.8038799522441896e-06, + "loss": 0.7982, + "step": 2201 + }, + { + "epoch": 0.8068889703187981, + "grad_norm": 2.4225881099700928, + "learning_rate": 1.7972657921537195e-06, + "loss": 0.7971, + "step": 2202 + }, + { + "epoch": 0.8072554049102235, + "grad_norm": 2.258533477783203, + "learning_rate": 1.7906625827138259e-06, + "loss": 0.816, + "step": 2203 + }, + { + "epoch": 0.807621839501649, + "grad_norm": 2.410449504852295, + "learning_rate": 1.7840703327397913e-06, + "loss": 0.8089, + "step": 2204 + }, + { + "epoch": 0.8079882740930744, + "grad_norm": 2.261918544769287, + "learning_rate": 1.777489051032275e-06, + "loss": 0.7624, + "step": 2205 + }, + { + "epoch": 0.8083547086844998, + "grad_norm": 2.631765365600586, + "learning_rate": 1.7709187463772825e-06, + "loss": 0.7966, + "step": 2206 + }, + { + "epoch": 0.8087211432759253, + "grad_norm": 2.48982834815979, + "learning_rate": 1.7643594275461763e-06, + "loss": 0.8113, + "step": 2207 + }, + { + "epoch": 0.8090875778673506, + "grad_norm": 2.338019847869873, + "learning_rate": 1.7578111032956413e-06, + "loss": 0.8178, + "step": 2208 + }, + { + "epoch": 0.8094540124587761, + "grad_norm": 2.7235567569732666, + "learning_rate": 1.7512737823676939e-06, + "loss": 0.7882, + "step": 2209 + }, + { + "epoch": 0.8098204470502015, + "grad_norm": 2.395200729370117, + "learning_rate": 1.7447474734896608e-06, + "loss": 0.8164, + "step": 2210 + }, + { + "epoch": 0.810186881641627, + "grad_norm": 2.603877305984497, + "learning_rate": 1.7382321853741579e-06, + "loss": 0.7299, + "step": 2211 + }, + { + "epoch": 0.8105533162330524, + "grad_norm": 2.3217122554779053, + "learning_rate": 1.7317279267191e-06, + "loss": 0.7892, + "step": 2212 + }, + { + "epoch": 0.8109197508244779, + "grad_norm": 2.780245780944824, + "learning_rate": 1.7252347062076712e-06, + "loss": 0.8099, + "step": 2213 + }, + { + "epoch": 0.8112861854159032, + "grad_norm": 2.129652976989746, + "learning_rate": 1.718752532508321e-06, + "loss": 0.7873, + "step": 2214 + }, + { + "epoch": 0.8116526200073287, + "grad_norm": 2.375014305114746, + "learning_rate": 1.7122814142747557e-06, + "loss": 0.7613, + "step": 2215 + }, + { + "epoch": 0.8120190545987541, + "grad_norm": 2.627427577972412, + "learning_rate": 1.7058213601459116e-06, + "loss": 0.7451, + "step": 2216 + }, + { + "epoch": 0.8123854891901795, + "grad_norm": 2.5515501499176025, + "learning_rate": 1.699372378745966e-06, + "loss": 0.8093, + "step": 2217 + }, + { + "epoch": 0.812751923781605, + "grad_norm": 2.86523699760437, + "learning_rate": 1.6929344786843072e-06, + "loss": 0.8233, + "step": 2218 + }, + { + "epoch": 0.8131183583730304, + "grad_norm": 2.387831687927246, + "learning_rate": 1.6865076685555348e-06, + "loss": 0.8221, + "step": 2219 + }, + { + "epoch": 0.8134847929644559, + "grad_norm": 2.3213229179382324, + "learning_rate": 1.6800919569394403e-06, + "loss": 0.8073, + "step": 2220 + }, + { + "epoch": 0.8138512275558812, + "grad_norm": 2.4749796390533447, + "learning_rate": 1.6736873524009945e-06, + "loss": 0.8235, + "step": 2221 + }, + { + "epoch": 0.8142176621473067, + "grad_norm": 3.385014057159424, + "learning_rate": 1.667293863490347e-06, + "loss": 0.7252, + "step": 2222 + }, + { + "epoch": 0.8145840967387321, + "grad_norm": 2.663835287094116, + "learning_rate": 1.6609114987428077e-06, + "loss": 0.7411, + "step": 2223 + }, + { + "epoch": 0.8149505313301576, + "grad_norm": 2.109029769897461, + "learning_rate": 1.6545402666788312e-06, + "loss": 0.8714, + "step": 2224 + }, + { + "epoch": 0.815316965921583, + "grad_norm": 2.6866328716278076, + "learning_rate": 1.648180175804016e-06, + "loss": 0.7418, + "step": 2225 + }, + { + "epoch": 0.8156834005130085, + "grad_norm": 2.237931966781616, + "learning_rate": 1.6418312346090837e-06, + "loss": 0.7964, + "step": 2226 + }, + { + "epoch": 0.8160498351044339, + "grad_norm": 2.364955425262451, + "learning_rate": 1.635493451569866e-06, + "loss": 0.8698, + "step": 2227 + }, + { + "epoch": 0.8164162696958593, + "grad_norm": 2.356473684310913, + "learning_rate": 1.6291668351473089e-06, + "loss": 0.8051, + "step": 2228 + }, + { + "epoch": 0.8167827042872847, + "grad_norm": 2.1675374507904053, + "learning_rate": 1.6228513937874424e-06, + "loss": 0.8232, + "step": 2229 + }, + { + "epoch": 0.8171491388787101, + "grad_norm": 2.407489538192749, + "learning_rate": 1.6165471359213847e-06, + "loss": 0.8406, + "step": 2230 + }, + { + "epoch": 0.8175155734701356, + "grad_norm": 2.22987699508667, + "learning_rate": 1.610254069965318e-06, + "loss": 0.8384, + "step": 2231 + }, + { + "epoch": 0.817882008061561, + "grad_norm": 2.386528491973877, + "learning_rate": 1.6039722043204832e-06, + "loss": 0.8679, + "step": 2232 + }, + { + "epoch": 0.8182484426529865, + "grad_norm": 3.053328037261963, + "learning_rate": 1.5977015473731739e-06, + "loss": 0.7288, + "step": 2233 + }, + { + "epoch": 0.8186148772444118, + "grad_norm": 2.4391896724700928, + "learning_rate": 1.5914421074947173e-06, + "loss": 0.8059, + "step": 2234 + }, + { + "epoch": 0.8189813118358373, + "grad_norm": 2.8696322441101074, + "learning_rate": 1.5851938930414657e-06, + "loss": 0.7281, + "step": 2235 + }, + { + "epoch": 0.8193477464272627, + "grad_norm": 2.360123872756958, + "learning_rate": 1.5789569123547876e-06, + "loss": 0.7647, + "step": 2236 + }, + { + "epoch": 0.8197141810186882, + "grad_norm": 2.4133071899414062, + "learning_rate": 1.5727311737610495e-06, + "loss": 0.8045, + "step": 2237 + }, + { + "epoch": 0.8200806156101136, + "grad_norm": 2.3202567100524902, + "learning_rate": 1.566516685571613e-06, + "loss": 0.806, + "step": 2238 + }, + { + "epoch": 0.8204470502015391, + "grad_norm": 2.883397340774536, + "learning_rate": 1.5603134560828227e-06, + "loss": 0.7688, + "step": 2239 + }, + { + "epoch": 0.8208134847929645, + "grad_norm": 2.4643895626068115, + "learning_rate": 1.5541214935759875e-06, + "loss": 0.8164, + "step": 2240 + }, + { + "epoch": 0.8211799193843898, + "grad_norm": 2.317261219024658, + "learning_rate": 1.5479408063173818e-06, + "loss": 0.8055, + "step": 2241 + }, + { + "epoch": 0.8215463539758153, + "grad_norm": 2.808366060256958, + "learning_rate": 1.5417714025582197e-06, + "loss": 0.7559, + "step": 2242 + }, + { + "epoch": 0.8219127885672407, + "grad_norm": 2.359262704849243, + "learning_rate": 1.5356132905346543e-06, + "loss": 0.8299, + "step": 2243 + }, + { + "epoch": 0.8222792231586662, + "grad_norm": 2.450434446334839, + "learning_rate": 1.529466478467768e-06, + "loss": 0.831, + "step": 2244 + }, + { + "epoch": 0.8226456577500916, + "grad_norm": 2.4397132396698, + "learning_rate": 1.523330974563554e-06, + "loss": 0.797, + "step": 2245 + }, + { + "epoch": 0.8230120923415171, + "grad_norm": 2.272751808166504, + "learning_rate": 1.5172067870129137e-06, + "loss": 0.8234, + "step": 2246 + }, + { + "epoch": 0.8233785269329424, + "grad_norm": 2.673409938812256, + "learning_rate": 1.5110939239916334e-06, + "loss": 0.8879, + "step": 2247 + }, + { + "epoch": 0.8237449615243679, + "grad_norm": 2.298658847808838, + "learning_rate": 1.504992393660386e-06, + "loss": 0.766, + "step": 2248 + }, + { + "epoch": 0.8241113961157933, + "grad_norm": 2.376124858856201, + "learning_rate": 1.4989022041647171e-06, + "loss": 0.8162, + "step": 2249 + }, + { + "epoch": 0.8244778307072188, + "grad_norm": 2.5253942012786865, + "learning_rate": 1.4928233636350287e-06, + "loss": 0.8094, + "step": 2250 + }, + { + "epoch": 0.8248442652986442, + "grad_norm": 2.904883623123169, + "learning_rate": 1.4867558801865744e-06, + "loss": 0.8148, + "step": 2251 + }, + { + "epoch": 0.8252106998900697, + "grad_norm": 2.704359531402588, + "learning_rate": 1.480699761919443e-06, + "loss": 0.8203, + "step": 2252 + }, + { + "epoch": 0.8255771344814951, + "grad_norm": 2.621049642562866, + "learning_rate": 1.4746550169185536e-06, + "loss": 0.8155, + "step": 2253 + }, + { + "epoch": 0.8259435690729204, + "grad_norm": 2.492042303085327, + "learning_rate": 1.4686216532536435e-06, + "loss": 0.8047, + "step": 2254 + }, + { + "epoch": 0.8263100036643459, + "grad_norm": 2.2928173542022705, + "learning_rate": 1.4625996789792475e-06, + "loss": 0.865, + "step": 2255 + }, + { + "epoch": 0.8266764382557713, + "grad_norm": 2.7155025005340576, + "learning_rate": 1.4565891021347066e-06, + "loss": 0.7329, + "step": 2256 + }, + { + "epoch": 0.8270428728471968, + "grad_norm": 2.393474578857422, + "learning_rate": 1.4505899307441418e-06, + "loss": 0.8134, + "step": 2257 + }, + { + "epoch": 0.8274093074386222, + "grad_norm": 2.2670514583587646, + "learning_rate": 1.4446021728164417e-06, + "loss": 0.8262, + "step": 2258 + }, + { + "epoch": 0.8277757420300477, + "grad_norm": 2.44132924079895, + "learning_rate": 1.4386258363452676e-06, + "loss": 0.8182, + "step": 2259 + }, + { + "epoch": 0.828142176621473, + "grad_norm": 2.589374542236328, + "learning_rate": 1.4326609293090289e-06, + "loss": 0.7247, + "step": 2260 + }, + { + "epoch": 0.8285086112128985, + "grad_norm": 2.5723989009857178, + "learning_rate": 1.4267074596708752e-06, + "loss": 0.8103, + "step": 2261 + }, + { + "epoch": 0.8288750458043239, + "grad_norm": 2.5136826038360596, + "learning_rate": 1.4207654353786926e-06, + "loss": 0.7336, + "step": 2262 + }, + { + "epoch": 0.8292414803957494, + "grad_norm": 2.8288135528564453, + "learning_rate": 1.4148348643650778e-06, + "loss": 0.7746, + "step": 2263 + }, + { + "epoch": 0.8296079149871748, + "grad_norm": 2.1983284950256348, + "learning_rate": 1.4089157545473476e-06, + "loss": 0.8172, + "step": 2264 + }, + { + "epoch": 0.8299743495786002, + "grad_norm": 2.543551445007324, + "learning_rate": 1.4030081138275142e-06, + "loss": 0.8106, + "step": 2265 + }, + { + "epoch": 0.8303407841700257, + "grad_norm": 2.4806594848632812, + "learning_rate": 1.3971119500922748e-06, + "loss": 0.7666, + "step": 2266 + }, + { + "epoch": 0.830707218761451, + "grad_norm": 2.7103400230407715, + "learning_rate": 1.3912272712130138e-06, + "loss": 0.8218, + "step": 2267 + }, + { + "epoch": 0.8310736533528765, + "grad_norm": 2.2749123573303223, + "learning_rate": 1.3853540850457704e-06, + "loss": 0.834, + "step": 2268 + }, + { + "epoch": 0.8314400879443019, + "grad_norm": 2.777505397796631, + "learning_rate": 1.3794923994312536e-06, + "loss": 0.8265, + "step": 2269 + }, + { + "epoch": 0.8318065225357274, + "grad_norm": 2.378373622894287, + "learning_rate": 1.373642222194812e-06, + "loss": 0.8237, + "step": 2270 + }, + { + "epoch": 0.8321729571271528, + "grad_norm": 2.2618765830993652, + "learning_rate": 1.3678035611464335e-06, + "loss": 0.8411, + "step": 2271 + }, + { + "epoch": 0.8325393917185783, + "grad_norm": 2.407137632369995, + "learning_rate": 1.3619764240807331e-06, + "loss": 0.8035, + "step": 2272 + }, + { + "epoch": 0.8329058263100036, + "grad_norm": 2.2529635429382324, + "learning_rate": 1.3561608187769348e-06, + "loss": 0.8084, + "step": 2273 + }, + { + "epoch": 0.8332722609014291, + "grad_norm": 2.5161614418029785, + "learning_rate": 1.3503567529988725e-06, + "loss": 0.7589, + "step": 2274 + }, + { + "epoch": 0.8336386954928545, + "grad_norm": 2.4014391899108887, + "learning_rate": 1.3445642344949771e-06, + "loss": 0.7823, + "step": 2275 + }, + { + "epoch": 0.83400513008428, + "grad_norm": 3.0440123081207275, + "learning_rate": 1.3387832709982606e-06, + "loss": 0.6788, + "step": 2276 + }, + { + "epoch": 0.8343715646757054, + "grad_norm": 2.168713331222534, + "learning_rate": 1.3330138702263095e-06, + "loss": 0.8395, + "step": 2277 + }, + { + "epoch": 0.8347379992671308, + "grad_norm": 2.9199330806732178, + "learning_rate": 1.3272560398812705e-06, + "loss": 0.7725, + "step": 2278 + }, + { + "epoch": 0.8351044338585563, + "grad_norm": 2.082176923751831, + "learning_rate": 1.3215097876498507e-06, + "loss": 0.8181, + "step": 2279 + }, + { + "epoch": 0.8354708684499816, + "grad_norm": 2.3910794258117676, + "learning_rate": 1.3157751212032953e-06, + "loss": 0.8273, + "step": 2280 + }, + { + "epoch": 0.8358373030414071, + "grad_norm": 2.448989152908325, + "learning_rate": 1.3100520481973867e-06, + "loss": 0.829, + "step": 2281 + }, + { + "epoch": 0.8362037376328325, + "grad_norm": 2.466848373413086, + "learning_rate": 1.3043405762724271e-06, + "loss": 0.8387, + "step": 2282 + }, + { + "epoch": 0.836570172224258, + "grad_norm": 2.8775925636291504, + "learning_rate": 1.2986407130532309e-06, + "loss": 0.7192, + "step": 2283 + }, + { + "epoch": 0.8369366068156834, + "grad_norm": 2.5638585090637207, + "learning_rate": 1.2929524661491143e-06, + "loss": 0.7766, + "step": 2284 + }, + { + "epoch": 0.8373030414071089, + "grad_norm": 2.6455864906311035, + "learning_rate": 1.28727584315389e-06, + "loss": 0.7771, + "step": 2285 + }, + { + "epoch": 0.8376694759985343, + "grad_norm": 2.722618341445923, + "learning_rate": 1.281610851645848e-06, + "loss": 0.79, + "step": 2286 + }, + { + "epoch": 0.8380359105899597, + "grad_norm": 2.9081006050109863, + "learning_rate": 1.2759574991877565e-06, + "loss": 0.7423, + "step": 2287 + }, + { + "epoch": 0.8384023451813851, + "grad_norm": 2.7987332344055176, + "learning_rate": 1.270315793326835e-06, + "loss": 0.7768, + "step": 2288 + }, + { + "epoch": 0.8387687797728105, + "grad_norm": 2.355983257293701, + "learning_rate": 1.2646857415947656e-06, + "loss": 0.8366, + "step": 2289 + }, + { + "epoch": 0.839135214364236, + "grad_norm": 2.514294147491455, + "learning_rate": 1.2590673515076645e-06, + "loss": 0.7984, + "step": 2290 + }, + { + "epoch": 0.8395016489556614, + "grad_norm": 3.055561065673828, + "learning_rate": 1.2534606305660847e-06, + "loss": 0.7668, + "step": 2291 + }, + { + "epoch": 0.8398680835470869, + "grad_norm": 2.742436647415161, + "learning_rate": 1.2478655862549993e-06, + "loss": 0.7446, + "step": 2292 + }, + { + "epoch": 0.8402345181385122, + "grad_norm": 2.348378896713257, + "learning_rate": 1.2422822260437927e-06, + "loss": 0.8166, + "step": 2293 + }, + { + "epoch": 0.8406009527299377, + "grad_norm": 2.8558847904205322, + "learning_rate": 1.2367105573862492e-06, + "loss": 0.804, + "step": 2294 + }, + { + "epoch": 0.8409673873213631, + "grad_norm": 2.6892967224121094, + "learning_rate": 1.2311505877205476e-06, + "loss": 0.7358, + "step": 2295 + }, + { + "epoch": 0.8413338219127886, + "grad_norm": 2.377847194671631, + "learning_rate": 1.2256023244692473e-06, + "loss": 0.8626, + "step": 2296 + }, + { + "epoch": 0.841700256504214, + "grad_norm": 2.764005422592163, + "learning_rate": 1.220065775039282e-06, + "loss": 0.7776, + "step": 2297 + }, + { + "epoch": 0.8420666910956395, + "grad_norm": 2.4356181621551514, + "learning_rate": 1.214540946821945e-06, + "loss": 0.7624, + "step": 2298 + }, + { + "epoch": 0.8424331256870649, + "grad_norm": 2.727997303009033, + "learning_rate": 1.2090278471928807e-06, + "loss": 0.8431, + "step": 2299 + }, + { + "epoch": 0.8427995602784902, + "grad_norm": 2.2275822162628174, + "learning_rate": 1.2035264835120787e-06, + "loss": 0.825, + "step": 2300 + }, + { + "epoch": 0.8431659948699157, + "grad_norm": 2.6020660400390625, + "learning_rate": 1.1980368631238582e-06, + "loss": 0.7773, + "step": 2301 + }, + { + "epoch": 0.8435324294613411, + "grad_norm": 2.4587666988372803, + "learning_rate": 1.1925589933568626e-06, + "loss": 0.8367, + "step": 2302 + }, + { + "epoch": 0.8438988640527666, + "grad_norm": 2.292140245437622, + "learning_rate": 1.1870928815240522e-06, + "loss": 0.7917, + "step": 2303 + }, + { + "epoch": 0.844265298644192, + "grad_norm": 2.52288556098938, + "learning_rate": 1.1816385349226822e-06, + "loss": 0.8458, + "step": 2304 + }, + { + "epoch": 0.8446317332356175, + "grad_norm": 2.344015121459961, + "learning_rate": 1.1761959608343065e-06, + "loss": 0.8382, + "step": 2305 + }, + { + "epoch": 0.8449981678270428, + "grad_norm": 2.7470791339874268, + "learning_rate": 1.170765166524762e-06, + "loss": 0.6318, + "step": 2306 + }, + { + "epoch": 0.8453646024184683, + "grad_norm": 2.4314420223236084, + "learning_rate": 1.1653461592441606e-06, + "loss": 0.8237, + "step": 2307 + }, + { + "epoch": 0.8457310370098937, + "grad_norm": 2.8293542861938477, + "learning_rate": 1.1599389462268783e-06, + "loss": 0.8543, + "step": 2308 + }, + { + "epoch": 0.8460974716013192, + "grad_norm": 2.1980388164520264, + "learning_rate": 1.1545435346915413e-06, + "loss": 0.8001, + "step": 2309 + }, + { + "epoch": 0.8464639061927446, + "grad_norm": 2.2609615325927734, + "learning_rate": 1.1491599318410251e-06, + "loss": 0.8064, + "step": 2310 + }, + { + "epoch": 0.8468303407841701, + "grad_norm": 2.2475104331970215, + "learning_rate": 1.1437881448624432e-06, + "loss": 0.8738, + "step": 2311 + }, + { + "epoch": 0.8471967753755955, + "grad_norm": 2.4149606227874756, + "learning_rate": 1.1384281809271268e-06, + "loss": 0.8969, + "step": 2312 + }, + { + "epoch": 0.8475632099670208, + "grad_norm": 2.826284170150757, + "learning_rate": 1.1330800471906323e-06, + "loss": 0.7542, + "step": 2313 + }, + { + "epoch": 0.8479296445584463, + "grad_norm": 2.559403419494629, + "learning_rate": 1.1277437507927137e-06, + "loss": 0.7561, + "step": 2314 + }, + { + "epoch": 0.8482960791498717, + "grad_norm": 2.6555681228637695, + "learning_rate": 1.122419298857329e-06, + "loss": 0.82, + "step": 2315 + }, + { + "epoch": 0.8486625137412972, + "grad_norm": 2.4660537242889404, + "learning_rate": 1.1171066984926238e-06, + "loss": 0.8118, + "step": 2316 + }, + { + "epoch": 0.8490289483327226, + "grad_norm": 2.449301242828369, + "learning_rate": 1.1118059567909178e-06, + "loss": 0.8639, + "step": 2317 + }, + { + "epoch": 0.8493953829241481, + "grad_norm": 2.502134084701538, + "learning_rate": 1.1065170808287017e-06, + "loss": 0.8315, + "step": 2318 + }, + { + "epoch": 0.8497618175155734, + "grad_norm": 2.407904624938965, + "learning_rate": 1.1012400776666276e-06, + "loss": 0.8133, + "step": 2319 + }, + { + "epoch": 0.8501282521069989, + "grad_norm": 2.5190916061401367, + "learning_rate": 1.0959749543494902e-06, + "loss": 0.8109, + "step": 2320 + }, + { + "epoch": 0.8504946866984243, + "grad_norm": 2.5884244441986084, + "learning_rate": 1.0907217179062335e-06, + "loss": 0.7594, + "step": 2321 + }, + { + "epoch": 0.8508611212898498, + "grad_norm": 2.1998255252838135, + "learning_rate": 1.0854803753499276e-06, + "loss": 0.7992, + "step": 2322 + }, + { + "epoch": 0.8512275558812752, + "grad_norm": 2.3266897201538086, + "learning_rate": 1.0802509336777634e-06, + "loss": 0.7921, + "step": 2323 + }, + { + "epoch": 0.8515939904727006, + "grad_norm": 2.42293381690979, + "learning_rate": 1.0750333998710493e-06, + "loss": 0.8439, + "step": 2324 + }, + { + "epoch": 0.8519604250641261, + "grad_norm": 2.2549338340759277, + "learning_rate": 1.0698277808951873e-06, + "loss": 0.8329, + "step": 2325 + }, + { + "epoch": 0.8523268596555514, + "grad_norm": 2.6438047885894775, + "learning_rate": 1.0646340836996837e-06, + "loss": 0.7681, + "step": 2326 + }, + { + "epoch": 0.8526932942469769, + "grad_norm": 2.733704090118408, + "learning_rate": 1.0594523152181223e-06, + "loss": 0.8068, + "step": 2327 + }, + { + "epoch": 0.8530597288384023, + "grad_norm": 2.45572829246521, + "learning_rate": 1.0542824823681663e-06, + "loss": 0.8057, + "step": 2328 + }, + { + "epoch": 0.8534261634298278, + "grad_norm": 2.4164037704467773, + "learning_rate": 1.0491245920515435e-06, + "loss": 0.7422, + "step": 2329 + }, + { + "epoch": 0.8537925980212532, + "grad_norm": 2.500162124633789, + "learning_rate": 1.0439786511540328e-06, + "loss": 0.8651, + "step": 2330 + }, + { + "epoch": 0.8541590326126787, + "grad_norm": 2.2892117500305176, + "learning_rate": 1.0388446665454699e-06, + "loss": 0.749, + "step": 2331 + }, + { + "epoch": 0.854525467204104, + "grad_norm": 2.167318820953369, + "learning_rate": 1.0337226450797234e-06, + "loss": 0.838, + "step": 2332 + }, + { + "epoch": 0.8548919017955295, + "grad_norm": 2.1648292541503906, + "learning_rate": 1.0286125935946934e-06, + "loss": 0.7958, + "step": 2333 + }, + { + "epoch": 0.8552583363869549, + "grad_norm": 2.304607629776001, + "learning_rate": 1.0235145189122964e-06, + "loss": 0.7698, + "step": 2334 + }, + { + "epoch": 0.8556247709783804, + "grad_norm": 2.7921359539031982, + "learning_rate": 1.018428427838465e-06, + "loss": 0.8214, + "step": 2335 + }, + { + "epoch": 0.8559912055698058, + "grad_norm": 2.3064191341400146, + "learning_rate": 1.0133543271631275e-06, + "loss": 0.8311, + "step": 2336 + }, + { + "epoch": 0.8563576401612312, + "grad_norm": 2.405390977859497, + "learning_rate": 1.0082922236602112e-06, + "loss": 0.7864, + "step": 2337 + }, + { + "epoch": 0.8567240747526567, + "grad_norm": 2.9421746730804443, + "learning_rate": 1.0032421240876233e-06, + "loss": 0.7838, + "step": 2338 + }, + { + "epoch": 0.857090509344082, + "grad_norm": 2.6204707622528076, + "learning_rate": 9.982040351872501e-07, + "loss": 0.7925, + "step": 2339 + }, + { + "epoch": 0.8574569439355075, + "grad_norm": 2.2351484298706055, + "learning_rate": 9.93177963684937e-07, + "loss": 0.8206, + "step": 2340 + }, + { + "epoch": 0.8578233785269329, + "grad_norm": 2.7413156032562256, + "learning_rate": 9.881639162904898e-07, + "loss": 0.8392, + "step": 2341 + }, + { + "epoch": 0.8581898131183584, + "grad_norm": 2.6065118312835693, + "learning_rate": 9.831618996976633e-07, + "loss": 0.7752, + "step": 2342 + }, + { + "epoch": 0.8585562477097838, + "grad_norm": 2.428201675415039, + "learning_rate": 9.7817192058415e-07, + "loss": 0.7817, + "step": 2343 + }, + { + "epoch": 0.8589226823012093, + "grad_norm": 2.6025755405426025, + "learning_rate": 9.731939856115736e-07, + "loss": 0.803, + "step": 2344 + }, + { + "epoch": 0.8592891168926347, + "grad_norm": 2.837523937225342, + "learning_rate": 9.682281014254736e-07, + "loss": 0.7726, + "step": 2345 + }, + { + "epoch": 0.8596555514840601, + "grad_norm": 2.9446375370025635, + "learning_rate": 9.632742746553102e-07, + "loss": 0.7836, + "step": 2346 + }, + { + "epoch": 0.8600219860754855, + "grad_norm": 2.481795310974121, + "learning_rate": 9.583325119144382e-07, + "loss": 0.8244, + "step": 2347 + }, + { + "epoch": 0.8603884206669109, + "grad_norm": 2.6143958568573, + "learning_rate": 9.534028198001122e-07, + "loss": 0.8114, + "step": 2348 + }, + { + "epoch": 0.8607548552583364, + "grad_norm": 2.3166778087615967, + "learning_rate": 9.484852048934734e-07, + "loss": 0.8259, + "step": 2349 + }, + { + "epoch": 0.8611212898497618, + "grad_norm": 2.3150179386138916, + "learning_rate": 9.435796737595382e-07, + "loss": 0.8087, + "step": 2350 + }, + { + "epoch": 0.8614877244411873, + "grad_norm": 2.456609010696411, + "learning_rate": 9.386862329471869e-07, + "loss": 0.8285, + "step": 2351 + }, + { + "epoch": 0.8618541590326126, + "grad_norm": 2.5039875507354736, + "learning_rate": 9.33804888989166e-07, + "loss": 0.8021, + "step": 2352 + }, + { + "epoch": 0.8622205936240381, + "grad_norm": 2.201730489730835, + "learning_rate": 9.289356484020706e-07, + "loss": 0.816, + "step": 2353 + }, + { + "epoch": 0.8625870282154635, + "grad_norm": 2.5118815898895264, + "learning_rate": 9.240785176863354e-07, + "loss": 0.8319, + "step": 2354 + }, + { + "epoch": 0.862953462806889, + "grad_norm": 2.379840850830078, + "learning_rate": 9.192335033262334e-07, + "loss": 0.8279, + "step": 2355 + }, + { + "epoch": 0.8633198973983144, + "grad_norm": 2.3860526084899902, + "learning_rate": 9.144006117898541e-07, + "loss": 0.8034, + "step": 2356 + }, + { + "epoch": 0.8636863319897399, + "grad_norm": 2.636835813522339, + "learning_rate": 9.095798495291119e-07, + "loss": 0.7626, + "step": 2357 + }, + { + "epoch": 0.8640527665811653, + "grad_norm": 2.058418035507202, + "learning_rate": 9.047712229797224e-07, + "loss": 0.7966, + "step": 2358 + }, + { + "epoch": 0.8644192011725907, + "grad_norm": 2.3849425315856934, + "learning_rate": 8.999747385612023e-07, + "loss": 0.8084, + "step": 2359 + }, + { + "epoch": 0.8647856357640161, + "grad_norm": 2.908236026763916, + "learning_rate": 8.951904026768621e-07, + "loss": 0.7261, + "step": 2360 + }, + { + "epoch": 0.8651520703554415, + "grad_norm": 2.460125207901001, + "learning_rate": 8.904182217137847e-07, + "loss": 0.8432, + "step": 2361 + }, + { + "epoch": 0.865518504946867, + "grad_norm": 2.6111361980438232, + "learning_rate": 8.856582020428362e-07, + "loss": 0.7894, + "step": 2362 + }, + { + "epoch": 0.8658849395382924, + "grad_norm": 2.1920156478881836, + "learning_rate": 8.809103500186411e-07, + "loss": 0.8271, + "step": 2363 + }, + { + "epoch": 0.8662513741297179, + "grad_norm": 2.735114574432373, + "learning_rate": 8.761746719795827e-07, + "loss": 0.7336, + "step": 2364 + }, + { + "epoch": 0.8666178087211432, + "grad_norm": 2.2781002521514893, + "learning_rate": 8.714511742477927e-07, + "loss": 0.8652, + "step": 2365 + }, + { + "epoch": 0.8669842433125687, + "grad_norm": 2.6231000423431396, + "learning_rate": 8.66739863129139e-07, + "loss": 0.8134, + "step": 2366 + }, + { + "epoch": 0.8673506779039941, + "grad_norm": 2.858731746673584, + "learning_rate": 8.620407449132218e-07, + "loss": 0.7618, + "step": 2367 + }, + { + "epoch": 0.8677171124954196, + "grad_norm": 2.5288403034210205, + "learning_rate": 8.573538258733671e-07, + "loss": 0.7656, + "step": 2368 + }, + { + "epoch": 0.868083547086845, + "grad_norm": 2.189743995666504, + "learning_rate": 8.52679112266609e-07, + "loss": 0.8046, + "step": 2369 + }, + { + "epoch": 0.8684499816782705, + "grad_norm": 2.4853451251983643, + "learning_rate": 8.480166103336918e-07, + "loss": 0.7811, + "step": 2370 + }, + { + "epoch": 0.8688164162696959, + "grad_norm": 2.425382614135742, + "learning_rate": 8.433663262990544e-07, + "loss": 0.7532, + "step": 2371 + }, + { + "epoch": 0.8691828508611212, + "grad_norm": 2.3846395015716553, + "learning_rate": 8.387282663708285e-07, + "loss": 0.8075, + "step": 2372 + }, + { + "epoch": 0.8695492854525467, + "grad_norm": 2.588106870651245, + "learning_rate": 8.341024367408213e-07, + "loss": 0.8226, + "step": 2373 + }, + { + "epoch": 0.8699157200439721, + "grad_norm": 2.4514048099517822, + "learning_rate": 8.294888435845183e-07, + "loss": 0.8669, + "step": 2374 + }, + { + "epoch": 0.8702821546353976, + "grad_norm": 2.1290717124938965, + "learning_rate": 8.248874930610674e-07, + "loss": 0.838, + "step": 2375 + }, + { + "epoch": 0.870648589226823, + "grad_norm": 2.3119587898254395, + "learning_rate": 8.20298391313269e-07, + "loss": 0.8382, + "step": 2376 + }, + { + "epoch": 0.8710150238182485, + "grad_norm": 2.4327573776245117, + "learning_rate": 8.157215444675737e-07, + "loss": 0.8515, + "step": 2377 + }, + { + "epoch": 0.8713814584096738, + "grad_norm": 2.836103677749634, + "learning_rate": 8.111569586340751e-07, + "loss": 0.7742, + "step": 2378 + }, + { + "epoch": 0.8717478930010993, + "grad_norm": 2.512967824935913, + "learning_rate": 8.066046399064953e-07, + "loss": 0.8043, + "step": 2379 + }, + { + "epoch": 0.8721143275925247, + "grad_norm": 2.3081233501434326, + "learning_rate": 8.020645943621774e-07, + "loss": 0.843, + "step": 2380 + }, + { + "epoch": 0.8724807621839502, + "grad_norm": 2.9423413276672363, + "learning_rate": 7.975368280620865e-07, + "loss": 0.7945, + "step": 2381 + }, + { + "epoch": 0.8728471967753756, + "grad_norm": 2.687516450881958, + "learning_rate": 7.930213470507864e-07, + "loss": 0.7514, + "step": 2382 + }, + { + "epoch": 0.8732136313668011, + "grad_norm": 2.2465670108795166, + "learning_rate": 7.885181573564449e-07, + "loss": 0.7975, + "step": 2383 + }, + { + "epoch": 0.8735800659582265, + "grad_norm": 2.3269989490509033, + "learning_rate": 7.840272649908232e-07, + "loss": 0.8088, + "step": 2384 + }, + { + "epoch": 0.8739465005496518, + "grad_norm": 2.287290096282959, + "learning_rate": 7.795486759492621e-07, + "loss": 0.8352, + "step": 2385 + }, + { + "epoch": 0.8743129351410773, + "grad_norm": 2.7851336002349854, + "learning_rate": 7.750823962106779e-07, + "loss": 0.7436, + "step": 2386 + }, + { + "epoch": 0.8746793697325027, + "grad_norm": 2.395256519317627, + "learning_rate": 7.706284317375534e-07, + "loss": 0.8167, + "step": 2387 + }, + { + "epoch": 0.8750458043239282, + "grad_norm": 2.470905303955078, + "learning_rate": 7.661867884759321e-07, + "loss": 0.7964, + "step": 2388 + }, + { + "epoch": 0.8754122389153536, + "grad_norm": 2.294090986251831, + "learning_rate": 7.617574723554078e-07, + "loss": 0.8194, + "step": 2389 + }, + { + "epoch": 0.8757786735067791, + "grad_norm": 2.5644469261169434, + "learning_rate": 7.57340489289119e-07, + "loss": 0.8178, + "step": 2390 + }, + { + "epoch": 0.8761451080982045, + "grad_norm": 2.3230745792388916, + "learning_rate": 7.529358451737368e-07, + "loss": 0.8578, + "step": 2391 + }, + { + "epoch": 0.8765115426896299, + "grad_norm": 2.285337448120117, + "learning_rate": 7.485435458894619e-07, + "loss": 0.7574, + "step": 2392 + }, + { + "epoch": 0.8768779772810553, + "grad_norm": 2.14737606048584, + "learning_rate": 7.441635973000127e-07, + "loss": 0.8185, + "step": 2393 + }, + { + "epoch": 0.8772444118724808, + "grad_norm": 2.1775426864624023, + "learning_rate": 7.397960052526221e-07, + "loss": 0.8213, + "step": 2394 + }, + { + "epoch": 0.8776108464639062, + "grad_norm": 2.196223497390747, + "learning_rate": 7.354407755780257e-07, + "loss": 0.8192, + "step": 2395 + }, + { + "epoch": 0.8779772810553316, + "grad_norm": 2.274160385131836, + "learning_rate": 7.310979140904572e-07, + "loss": 0.8638, + "step": 2396 + }, + { + "epoch": 0.8783437156467571, + "grad_norm": 2.5875229835510254, + "learning_rate": 7.267674265876324e-07, + "loss": 0.8277, + "step": 2397 + }, + { + "epoch": 0.8787101502381824, + "grad_norm": 2.819317579269409, + "learning_rate": 7.224493188507553e-07, + "loss": 0.7778, + "step": 2398 + }, + { + "epoch": 0.8790765848296079, + "grad_norm": 2.390894651412964, + "learning_rate": 7.181435966444994e-07, + "loss": 0.804, + "step": 2399 + }, + { + "epoch": 0.8794430194210333, + "grad_norm": 2.9968502521514893, + "learning_rate": 7.138502657170043e-07, + "loss": 0.7772, + "step": 2400 + }, + { + "epoch": 0.8798094540124588, + "grad_norm": 2.596324920654297, + "learning_rate": 7.095693317998675e-07, + "loss": 0.7983, + "step": 2401 + }, + { + "epoch": 0.8801758886038842, + "grad_norm": 2.346898078918457, + "learning_rate": 7.053008006081341e-07, + "loss": 0.8041, + "step": 2402 + }, + { + "epoch": 0.8805423231953097, + "grad_norm": 2.4089863300323486, + "learning_rate": 7.010446778402968e-07, + "loss": 0.7913, + "step": 2403 + }, + { + "epoch": 0.880908757786735, + "grad_norm": 2.104218006134033, + "learning_rate": 6.968009691782763e-07, + "loss": 0.8543, + "step": 2404 + }, + { + "epoch": 0.8812751923781605, + "grad_norm": 2.470045566558838, + "learning_rate": 6.925696802874238e-07, + "loss": 0.7533, + "step": 2405 + }, + { + "epoch": 0.8816416269695859, + "grad_norm": 2.4869046211242676, + "learning_rate": 6.883508168165143e-07, + "loss": 0.8178, + "step": 2406 + }, + { + "epoch": 0.8820080615610114, + "grad_norm": 2.484778881072998, + "learning_rate": 6.841443843977258e-07, + "loss": 0.8237, + "step": 2407 + }, + { + "epoch": 0.8823744961524368, + "grad_norm": 2.4106252193450928, + "learning_rate": 6.799503886466485e-07, + "loss": 0.8261, + "step": 2408 + }, + { + "epoch": 0.8827409307438622, + "grad_norm": 2.831423759460449, + "learning_rate": 6.75768835162266e-07, + "loss": 0.7781, + "step": 2409 + }, + { + "epoch": 0.8831073653352877, + "grad_norm": 2.7222824096679688, + "learning_rate": 6.715997295269528e-07, + "loss": 0.7915, + "step": 2410 + }, + { + "epoch": 0.883473799926713, + "grad_norm": 2.098013401031494, + "learning_rate": 6.674430773064656e-07, + "loss": 0.8678, + "step": 2411 + }, + { + "epoch": 0.8838402345181385, + "grad_norm": 2.491773843765259, + "learning_rate": 6.632988840499355e-07, + "loss": 0.7576, + "step": 2412 + }, + { + "epoch": 0.8842066691095639, + "grad_norm": 2.345390558242798, + "learning_rate": 6.591671552898593e-07, + "loss": 0.8643, + "step": 2413 + }, + { + "epoch": 0.8845731037009894, + "grad_norm": 2.5588772296905518, + "learning_rate": 6.55047896542097e-07, + "loss": 0.8814, + "step": 2414 + }, + { + "epoch": 0.8849395382924148, + "grad_norm": 2.4958090782165527, + "learning_rate": 6.509411133058574e-07, + "loss": 0.827, + "step": 2415 + }, + { + "epoch": 0.8853059728838403, + "grad_norm": 2.6431095600128174, + "learning_rate": 6.468468110636961e-07, + "loss": 0.8081, + "step": 2416 + }, + { + "epoch": 0.8856724074752657, + "grad_norm": 2.8156349658966064, + "learning_rate": 6.427649952815096e-07, + "loss": 0.7839, + "step": 2417 + }, + { + "epoch": 0.8860388420666911, + "grad_norm": 2.345520496368408, + "learning_rate": 6.386956714085191e-07, + "loss": 0.8431, + "step": 2418 + }, + { + "epoch": 0.8864052766581165, + "grad_norm": 2.677828788757324, + "learning_rate": 6.346388448772722e-07, + "loss": 0.7809, + "step": 2419 + }, + { + "epoch": 0.8867717112495419, + "grad_norm": 3.008817195892334, + "learning_rate": 6.305945211036347e-07, + "loss": 0.7315, + "step": 2420 + }, + { + "epoch": 0.8871381458409674, + "grad_norm": 2.964496612548828, + "learning_rate": 6.265627054867773e-07, + "loss": 0.7653, + "step": 2421 + }, + { + "epoch": 0.8875045804323928, + "grad_norm": 2.6443655490875244, + "learning_rate": 6.22543403409177e-07, + "loss": 0.7382, + "step": 2422 + }, + { + "epoch": 0.8878710150238183, + "grad_norm": 2.525057792663574, + "learning_rate": 6.185366202365984e-07, + "loss": 0.8576, + "step": 2423 + }, + { + "epoch": 0.8882374496152436, + "grad_norm": 2.3738057613372803, + "learning_rate": 6.145423613180989e-07, + "loss": 0.8157, + "step": 2424 + }, + { + "epoch": 0.8886038842066691, + "grad_norm": 2.2518320083618164, + "learning_rate": 6.105606319860157e-07, + "loss": 0.8524, + "step": 2425 + }, + { + "epoch": 0.8889703187980945, + "grad_norm": 2.599637269973755, + "learning_rate": 6.065914375559556e-07, + "loss": 0.7961, + "step": 2426 + }, + { + "epoch": 0.88933675338952, + "grad_norm": 2.3115074634552, + "learning_rate": 6.026347833267942e-07, + "loss": 0.8109, + "step": 2427 + }, + { + "epoch": 0.8897031879809454, + "grad_norm": 2.493844509124756, + "learning_rate": 5.986906745806642e-07, + "loss": 0.7723, + "step": 2428 + }, + { + "epoch": 0.8900696225723709, + "grad_norm": 2.113201379776001, + "learning_rate": 5.947591165829514e-07, + "loss": 0.8207, + "step": 2429 + }, + { + "epoch": 0.8904360571637963, + "grad_norm": 2.5341408252716064, + "learning_rate": 5.908401145822862e-07, + "loss": 0.8079, + "step": 2430 + }, + { + "epoch": 0.8908024917552217, + "grad_norm": 2.5149288177490234, + "learning_rate": 5.869336738105369e-07, + "loss": 0.7806, + "step": 2431 + }, + { + "epoch": 0.8911689263466471, + "grad_norm": 2.887014389038086, + "learning_rate": 5.83039799482803e-07, + "loss": 0.7227, + "step": 2432 + }, + { + "epoch": 0.8915353609380725, + "grad_norm": 2.5421342849731445, + "learning_rate": 5.791584967974051e-07, + "loss": 0.8006, + "step": 2433 + }, + { + "epoch": 0.891901795529498, + "grad_norm": 2.474050283432007, + "learning_rate": 5.752897709358818e-07, + "loss": 0.8368, + "step": 2434 + }, + { + "epoch": 0.8922682301209234, + "grad_norm": 2.6157748699188232, + "learning_rate": 5.714336270629838e-07, + "loss": 0.8692, + "step": 2435 + }, + { + "epoch": 0.8926346647123489, + "grad_norm": 2.3381001949310303, + "learning_rate": 5.675900703266646e-07, + "loss": 0.8414, + "step": 2436 + }, + { + "epoch": 0.8930010993037742, + "grad_norm": 2.5202739238739014, + "learning_rate": 5.637591058580705e-07, + "loss": 0.8096, + "step": 2437 + }, + { + "epoch": 0.8933675338951997, + "grad_norm": 2.232762575149536, + "learning_rate": 5.599407387715405e-07, + "loss": 0.7977, + "step": 2438 + }, + { + "epoch": 0.8937339684866251, + "grad_norm": 2.746988296508789, + "learning_rate": 5.561349741645927e-07, + "loss": 0.7389, + "step": 2439 + }, + { + "epoch": 0.8941004030780506, + "grad_norm": 2.633068561553955, + "learning_rate": 5.523418171179252e-07, + "loss": 0.789, + "step": 2440 + }, + { + "epoch": 0.894466837669476, + "grad_norm": 2.6220340728759766, + "learning_rate": 5.485612726954015e-07, + "loss": 0.7741, + "step": 2441 + }, + { + "epoch": 0.8948332722609015, + "grad_norm": 2.605099678039551, + "learning_rate": 5.447933459440501e-07, + "loss": 0.8026, + "step": 2442 + }, + { + "epoch": 0.8951997068523269, + "grad_norm": 2.5063672065734863, + "learning_rate": 5.410380418940542e-07, + "loss": 0.7285, + "step": 2443 + }, + { + "epoch": 0.8955661414437522, + "grad_norm": 2.344224452972412, + "learning_rate": 5.37295365558741e-07, + "loss": 0.8282, + "step": 2444 + }, + { + "epoch": 0.8959325760351777, + "grad_norm": 2.5959107875823975, + "learning_rate": 5.33565321934586e-07, + "loss": 0.7285, + "step": 2445 + }, + { + "epoch": 0.8962990106266031, + "grad_norm": 2.6870009899139404, + "learning_rate": 5.298479160011982e-07, + "loss": 0.8097, + "step": 2446 + }, + { + "epoch": 0.8966654452180286, + "grad_norm": 2.544396162033081, + "learning_rate": 5.261431527213157e-07, + "loss": 0.8394, + "step": 2447 + }, + { + "epoch": 0.897031879809454, + "grad_norm": 2.3084309101104736, + "learning_rate": 5.224510370407943e-07, + "loss": 0.7856, + "step": 2448 + }, + { + "epoch": 0.8973983144008795, + "grad_norm": 2.6764068603515625, + "learning_rate": 5.187715738886112e-07, + "loss": 0.7997, + "step": 2449 + }, + { + "epoch": 0.8977647489923049, + "grad_norm": 2.511139392852783, + "learning_rate": 5.151047681768484e-07, + "loss": 0.8524, + "step": 2450 + }, + { + "epoch": 0.8981311835837303, + "grad_norm": 2.2220213413238525, + "learning_rate": 5.114506248006912e-07, + "loss": 0.8661, + "step": 2451 + }, + { + "epoch": 0.8984976181751557, + "grad_norm": 2.3586859703063965, + "learning_rate": 5.078091486384241e-07, + "loss": 0.7999, + "step": 2452 + }, + { + "epoch": 0.8988640527665812, + "grad_norm": 2.833298921585083, + "learning_rate": 5.041803445514159e-07, + "loss": 0.8183, + "step": 2453 + }, + { + "epoch": 0.8992304873580066, + "grad_norm": 2.7366554737091064, + "learning_rate": 5.005642173841196e-07, + "loss": 0.7463, + "step": 2454 + }, + { + "epoch": 0.899596921949432, + "grad_norm": 2.269247531890869, + "learning_rate": 4.969607719640668e-07, + "loss": 0.8269, + "step": 2455 + }, + { + "epoch": 0.8999633565408575, + "grad_norm": 2.2485527992248535, + "learning_rate": 4.93370013101856e-07, + "loss": 0.8307, + "step": 2456 + }, + { + "epoch": 0.9003297911322828, + "grad_norm": 2.685641288757324, + "learning_rate": 4.897919455911504e-07, + "loss": 0.8218, + "step": 2457 + }, + { + "epoch": 0.9006962257237083, + "grad_norm": 2.3350040912628174, + "learning_rate": 4.862265742086736e-07, + "loss": 0.813, + "step": 2458 + }, + { + "epoch": 0.9010626603151337, + "grad_norm": 2.218966245651245, + "learning_rate": 4.826739037141914e-07, + "loss": 0.8424, + "step": 2459 + }, + { + "epoch": 0.9014290949065592, + "grad_norm": 2.3188931941986084, + "learning_rate": 4.791339388505234e-07, + "loss": 0.8287, + "step": 2460 + }, + { + "epoch": 0.9017955294979846, + "grad_norm": 2.376735210418701, + "learning_rate": 4.756066843435203e-07, + "loss": 0.8246, + "step": 2461 + }, + { + "epoch": 0.9021619640894101, + "grad_norm": 2.435607671737671, + "learning_rate": 4.720921449020677e-07, + "loss": 0.8103, + "step": 2462 + }, + { + "epoch": 0.9025283986808355, + "grad_norm": 2.3804492950439453, + "learning_rate": 4.685903252180768e-07, + "loss": 0.8147, + "step": 2463 + }, + { + "epoch": 0.9028948332722609, + "grad_norm": 2.6249334812164307, + "learning_rate": 4.6510122996647477e-07, + "loss": 0.8083, + "step": 2464 + }, + { + "epoch": 0.9032612678636863, + "grad_norm": 2.683875560760498, + "learning_rate": 4.6162486380520676e-07, + "loss": 0.7687, + "step": 2465 + }, + { + "epoch": 0.9036277024551118, + "grad_norm": 2.6124651432037354, + "learning_rate": 4.581612313752193e-07, + "loss": 0.8229, + "step": 2466 + }, + { + "epoch": 0.9039941370465372, + "grad_norm": 2.606337308883667, + "learning_rate": 4.5471033730046353e-07, + "loss": 0.7641, + "step": 2467 + }, + { + "epoch": 0.9043605716379626, + "grad_norm": 2.6244354248046875, + "learning_rate": 4.512721861878844e-07, + "loss": 0.7512, + "step": 2468 + }, + { + "epoch": 0.9047270062293881, + "grad_norm": 2.6744701862335205, + "learning_rate": 4.478467826274102e-07, + "loss": 0.8043, + "step": 2469 + }, + { + "epoch": 0.9050934408208134, + "grad_norm": 2.865318536758423, + "learning_rate": 4.444341311919564e-07, + "loss": 0.8108, + "step": 2470 + }, + { + "epoch": 0.9054598754122389, + "grad_norm": 2.284832715988159, + "learning_rate": 4.4103423643741537e-07, + "loss": 0.7966, + "step": 2471 + }, + { + "epoch": 0.9058263100036643, + "grad_norm": 2.674898386001587, + "learning_rate": 4.37647102902643e-07, + "loss": 0.7783, + "step": 2472 + }, + { + "epoch": 0.9061927445950898, + "grad_norm": 2.2506966590881348, + "learning_rate": 4.3427273510946446e-07, + "loss": 0.8311, + "step": 2473 + }, + { + "epoch": 0.9065591791865152, + "grad_norm": 2.267212152481079, + "learning_rate": 4.309111375626618e-07, + "loss": 0.8903, + "step": 2474 + }, + { + "epoch": 0.9069256137779407, + "grad_norm": 2.3874728679656982, + "learning_rate": 4.2756231474996633e-07, + "loss": 0.8391, + "step": 2475 + }, + { + "epoch": 0.907292048369366, + "grad_norm": 2.156465768814087, + "learning_rate": 4.242262711420564e-07, + "loss": 0.8546, + "step": 2476 + }, + { + "epoch": 0.9076584829607915, + "grad_norm": 2.561232566833496, + "learning_rate": 4.2090301119255164e-07, + "loss": 0.7859, + "step": 2477 + }, + { + "epoch": 0.9080249175522169, + "grad_norm": 2.3131983280181885, + "learning_rate": 4.1759253933800423e-07, + "loss": 0.8125, + "step": 2478 + }, + { + "epoch": 0.9083913521436423, + "grad_norm": 2.7867021560668945, + "learning_rate": 4.1429485999789463e-07, + "loss": 0.7831, + "step": 2479 + }, + { + "epoch": 0.9087577867350678, + "grad_norm": 2.2129106521606445, + "learning_rate": 4.1100997757462344e-07, + "loss": 0.8181, + "step": 2480 + }, + { + "epoch": 0.9091242213264932, + "grad_norm": 2.780451536178589, + "learning_rate": 4.0773789645350945e-07, + "loss": 0.8104, + "step": 2481 + }, + { + "epoch": 0.9094906559179187, + "grad_norm": 2.6728110313415527, + "learning_rate": 4.044786210027829e-07, + "loss": 0.7943, + "step": 2482 + }, + { + "epoch": 0.909857090509344, + "grad_norm": 2.3612685203552246, + "learning_rate": 4.012321555735732e-07, + "loss": 0.8163, + "step": 2483 + }, + { + "epoch": 0.9102235251007695, + "grad_norm": 2.2369892597198486, + "learning_rate": 3.9799850449991574e-07, + "loss": 0.812, + "step": 2484 + }, + { + "epoch": 0.9105899596921949, + "grad_norm": 2.4772753715515137, + "learning_rate": 3.9477767209873065e-07, + "loss": 0.8274, + "step": 2485 + }, + { + "epoch": 0.9109563942836204, + "grad_norm": 2.409552812576294, + "learning_rate": 3.9156966266983175e-07, + "loss": 0.783, + "step": 2486 + }, + { + "epoch": 0.9113228288750458, + "grad_norm": 2.666396141052246, + "learning_rate": 3.883744804959122e-07, + "loss": 0.8067, + "step": 2487 + }, + { + "epoch": 0.9116892634664713, + "grad_norm": 2.2847819328308105, + "learning_rate": 3.851921298425398e-07, + "loss": 0.8637, + "step": 2488 + }, + { + "epoch": 0.9120556980578967, + "grad_norm": 2.1684303283691406, + "learning_rate": 3.8202261495815296e-07, + "loss": 0.7933, + "step": 2489 + }, + { + "epoch": 0.9124221326493221, + "grad_norm": 2.689586877822876, + "learning_rate": 3.7886594007405354e-07, + "loss": 0.7986, + "step": 2490 + }, + { + "epoch": 0.9127885672407475, + "grad_norm": 2.387988328933716, + "learning_rate": 3.757221094044028e-07, + "loss": 0.8394, + "step": 2491 + }, + { + "epoch": 0.9131550018321729, + "grad_norm": 2.657233953475952, + "learning_rate": 3.725911271462157e-07, + "loss": 0.8021, + "step": 2492 + }, + { + "epoch": 0.9135214364235984, + "grad_norm": 3.2616617679595947, + "learning_rate": 3.6947299747935427e-07, + "loss": 0.7529, + "step": 2493 + }, + { + "epoch": 0.9138878710150238, + "grad_norm": 2.5014402866363525, + "learning_rate": 3.663677245665198e-07, + "loss": 0.8123, + "step": 2494 + }, + { + "epoch": 0.9142543056064493, + "grad_norm": 3.2536182403564453, + "learning_rate": 3.6327531255325285e-07, + "loss": 0.75, + "step": 2495 + }, + { + "epoch": 0.9146207401978746, + "grad_norm": 2.8663833141326904, + "learning_rate": 3.6019576556792336e-07, + "loss": 0.7509, + "step": 2496 + }, + { + "epoch": 0.9149871747893001, + "grad_norm": 2.2998969554901123, + "learning_rate": 3.5712908772172596e-07, + "loss": 0.8589, + "step": 2497 + }, + { + "epoch": 0.9153536093807255, + "grad_norm": 2.583465337753296, + "learning_rate": 3.540752831086769e-07, + "loss": 0.7621, + "step": 2498 + }, + { + "epoch": 0.915720043972151, + "grad_norm": 2.637712001800537, + "learning_rate": 3.510343558056073e-07, + "loss": 0.8347, + "step": 2499 + }, + { + "epoch": 0.9160864785635764, + "grad_norm": 2.455714225769043, + "learning_rate": 3.4800630987215087e-07, + "loss": 0.7838, + "step": 2500 + }, + { + "epoch": 0.9164529131550019, + "grad_norm": 2.176424980163574, + "learning_rate": 3.4499114935075185e-07, + "loss": 0.8495, + "step": 2501 + }, + { + "epoch": 0.9168193477464273, + "grad_norm": 2.26564621925354, + "learning_rate": 3.4198887826664805e-07, + "loss": 0.8049, + "step": 2502 + }, + { + "epoch": 0.9171857823378526, + "grad_norm": 2.7091007232666016, + "learning_rate": 3.3899950062787343e-07, + "loss": 0.8091, + "step": 2503 + }, + { + "epoch": 0.9175522169292781, + "grad_norm": 2.7910430431365967, + "learning_rate": 3.360230204252468e-07, + "loss": 0.7887, + "step": 2504 + }, + { + "epoch": 0.9179186515207035, + "grad_norm": 2.3435018062591553, + "learning_rate": 3.330594416323696e-07, + "loss": 0.8108, + "step": 2505 + }, + { + "epoch": 0.918285086112129, + "grad_norm": 2.3385210037231445, + "learning_rate": 3.3010876820562033e-07, + "loss": 0.827, + "step": 2506 + }, + { + "epoch": 0.9186515207035544, + "grad_norm": 3.0472352504730225, + "learning_rate": 3.271710040841447e-07, + "loss": 0.6821, + "step": 2507 + }, + { + "epoch": 0.9190179552949799, + "grad_norm": 3.0347681045532227, + "learning_rate": 3.2424615318986e-07, + "loss": 0.8008, + "step": 2508 + }, + { + "epoch": 0.9193843898864053, + "grad_norm": 2.7286577224731445, + "learning_rate": 3.213342194274438e-07, + "loss": 0.8319, + "step": 2509 + }, + { + "epoch": 0.9197508244778307, + "grad_norm": 2.600947618484497, + "learning_rate": 3.184352066843266e-07, + "loss": 0.7789, + "step": 2510 + }, + { + "epoch": 0.9201172590692561, + "grad_norm": 2.3278796672821045, + "learning_rate": 3.1554911883068915e-07, + "loss": 0.8465, + "step": 2511 + }, + { + "epoch": 0.9204836936606816, + "grad_norm": 2.2521705627441406, + "learning_rate": 3.1267595971946176e-07, + "loss": 0.8016, + "step": 2512 + }, + { + "epoch": 0.920850128252107, + "grad_norm": 2.6614413261413574, + "learning_rate": 3.0981573318630943e-07, + "loss": 0.8114, + "step": 2513 + }, + { + "epoch": 0.9212165628435325, + "grad_norm": 2.2802844047546387, + "learning_rate": 3.0696844304963557e-07, + "loss": 0.7933, + "step": 2514 + }, + { + "epoch": 0.9215829974349579, + "grad_norm": 2.396064281463623, + "learning_rate": 3.0413409311057405e-07, + "loss": 0.8554, + "step": 2515 + }, + { + "epoch": 0.9219494320263832, + "grad_norm": 2.4574921131134033, + "learning_rate": 3.0131268715298034e-07, + "loss": 0.8178, + "step": 2516 + }, + { + "epoch": 0.9223158666178087, + "grad_norm": 2.4022841453552246, + "learning_rate": 2.9850422894343257e-07, + "loss": 0.7644, + "step": 2517 + }, + { + "epoch": 0.9226823012092341, + "grad_norm": 2.6130800247192383, + "learning_rate": 2.957087222312216e-07, + "loss": 0.8014, + "step": 2518 + }, + { + "epoch": 0.9230487358006596, + "grad_norm": 2.3932414054870605, + "learning_rate": 2.9292617074834995e-07, + "loss": 0.7709, + "step": 2519 + }, + { + "epoch": 0.923415170392085, + "grad_norm": 2.6321308612823486, + "learning_rate": 2.90156578209525e-07, + "loss": 0.7843, + "step": 2520 + }, + { + "epoch": 0.9237816049835105, + "grad_norm": 2.5634989738464355, + "learning_rate": 2.8739994831214923e-07, + "loss": 0.7625, + "step": 2521 + }, + { + "epoch": 0.9241480395749359, + "grad_norm": 2.321744441986084, + "learning_rate": 2.8465628473632656e-07, + "loss": 0.8626, + "step": 2522 + }, + { + "epoch": 0.9245144741663613, + "grad_norm": 2.713566541671753, + "learning_rate": 2.8192559114484817e-07, + "loss": 0.7424, + "step": 2523 + }, + { + "epoch": 0.9248809087577867, + "grad_norm": 2.113034963607788, + "learning_rate": 2.7920787118318917e-07, + "loss": 0.8691, + "step": 2524 + }, + { + "epoch": 0.9252473433492122, + "grad_norm": 2.3407957553863525, + "learning_rate": 2.7650312847950724e-07, + "loss": 0.8524, + "step": 2525 + }, + { + "epoch": 0.9256137779406376, + "grad_norm": 2.479217767715454, + "learning_rate": 2.738113666446307e-07, + "loss": 0.8401, + "step": 2526 + }, + { + "epoch": 0.925980212532063, + "grad_norm": 3.277244806289673, + "learning_rate": 2.711325892720651e-07, + "loss": 0.7803, + "step": 2527 + }, + { + "epoch": 0.9263466471234885, + "grad_norm": 2.369516611099243, + "learning_rate": 2.684667999379786e-07, + "loss": 0.8059, + "step": 2528 + }, + { + "epoch": 0.9267130817149138, + "grad_norm": 2.7027928829193115, + "learning_rate": 2.658140022011968e-07, + "loss": 0.7425, + "step": 2529 + }, + { + "epoch": 0.9270795163063393, + "grad_norm": 2.667623519897461, + "learning_rate": 2.6317419960320777e-07, + "loss": 0.7725, + "step": 2530 + }, + { + "epoch": 0.9274459508977647, + "grad_norm": 2.241896152496338, + "learning_rate": 2.60547395668147e-07, + "loss": 0.754, + "step": 2531 + }, + { + "epoch": 0.9278123854891902, + "grad_norm": 2.408830404281616, + "learning_rate": 2.5793359390279714e-07, + "loss": 0.8083, + "step": 2532 + }, + { + "epoch": 0.9281788200806156, + "grad_norm": 2.520829439163208, + "learning_rate": 2.553327977965847e-07, + "loss": 0.8242, + "step": 2533 + }, + { + "epoch": 0.9285452546720411, + "grad_norm": 2.2251739501953125, + "learning_rate": 2.5274501082157234e-07, + "loss": 0.8117, + "step": 2534 + }, + { + "epoch": 0.9289116892634665, + "grad_norm": 2.495321750640869, + "learning_rate": 2.501702364324554e-07, + "loss": 0.8041, + "step": 2535 + }, + { + "epoch": 0.929278123854892, + "grad_norm": 2.2292368412017822, + "learning_rate": 2.4760847806655997e-07, + "loss": 0.8338, + "step": 2536 + }, + { + "epoch": 0.9296445584463173, + "grad_norm": 2.5278828144073486, + "learning_rate": 2.450597391438303e-07, + "loss": 0.8465, + "step": 2537 + }, + { + "epoch": 0.9300109930377428, + "grad_norm": 3.1347997188568115, + "learning_rate": 2.425240230668358e-07, + "loss": 0.749, + "step": 2538 + }, + { + "epoch": 0.9303774276291682, + "grad_norm": 2.1962716579437256, + "learning_rate": 2.4000133322075515e-07, + "loss": 0.874, + "step": 2539 + }, + { + "epoch": 0.9307438622205936, + "grad_norm": 2.8005564212799072, + "learning_rate": 2.3749167297338117e-07, + "loss": 0.734, + "step": 2540 + }, + { + "epoch": 0.9311102968120191, + "grad_norm": 2.687960386276245, + "learning_rate": 2.3499504567511046e-07, + "loss": 0.8169, + "step": 2541 + }, + { + "epoch": 0.9314767314034444, + "grad_norm": 2.2408933639526367, + "learning_rate": 2.325114546589391e-07, + "loss": 0.7654, + "step": 2542 + }, + { + "epoch": 0.9318431659948699, + "grad_norm": 2.455065965652466, + "learning_rate": 2.3004090324046268e-07, + "loss": 0.8108, + "step": 2543 + }, + { + "epoch": 0.9322096005862953, + "grad_norm": 2.5029056072235107, + "learning_rate": 2.2758339471786628e-07, + "loss": 0.8277, + "step": 2544 + }, + { + "epoch": 0.9325760351777208, + "grad_norm": 3.1110899448394775, + "learning_rate": 2.2513893237192442e-07, + "loss": 0.8104, + "step": 2545 + }, + { + "epoch": 0.9329424697691462, + "grad_norm": 2.3951544761657715, + "learning_rate": 2.2270751946599556e-07, + "loss": 0.8182, + "step": 2546 + }, + { + "epoch": 0.9333089043605717, + "grad_norm": 2.4128873348236084, + "learning_rate": 2.2028915924601436e-07, + "loss": 0.833, + "step": 2547 + }, + { + "epoch": 0.9336753389519971, + "grad_norm": 3.34865140914917, + "learning_rate": 2.178838549404916e-07, + "loss": 0.7215, + "step": 2548 + }, + { + "epoch": 0.9340417735434225, + "grad_norm": 3.0843262672424316, + "learning_rate": 2.1549160976051088e-07, + "loss": 0.7681, + "step": 2549 + }, + { + "epoch": 0.9344082081348479, + "grad_norm": 2.328216552734375, + "learning_rate": 2.1311242689971757e-07, + "loss": 0.7778, + "step": 2550 + }, + { + "epoch": 0.9347746427262733, + "grad_norm": 2.731987476348877, + "learning_rate": 2.1074630953432208e-07, + "loss": 0.7579, + "step": 2551 + }, + { + "epoch": 0.9351410773176988, + "grad_norm": 2.6157193183898926, + "learning_rate": 2.0839326082309097e-07, + "loss": 0.7839, + "step": 2552 + }, + { + "epoch": 0.9355075119091242, + "grad_norm": 2.2835865020751953, + "learning_rate": 2.060532839073437e-07, + "loss": 0.8243, + "step": 2553 + }, + { + "epoch": 0.9358739465005497, + "grad_norm": 2.4783785343170166, + "learning_rate": 2.0372638191095028e-07, + "loss": 0.8011, + "step": 2554 + }, + { + "epoch": 0.936240381091975, + "grad_norm": 2.413538932800293, + "learning_rate": 2.0141255794032477e-07, + "loss": 0.8796, + "step": 2555 + }, + { + "epoch": 0.9366068156834005, + "grad_norm": 2.5933644771575928, + "learning_rate": 1.991118150844229e-07, + "loss": 0.8133, + "step": 2556 + }, + { + "epoch": 0.9369732502748259, + "grad_norm": 2.2094247341156006, + "learning_rate": 1.968241564147344e-07, + "loss": 0.7754, + "step": 2557 + }, + { + "epoch": 0.9373396848662514, + "grad_norm": 2.5051934719085693, + "learning_rate": 1.945495849852863e-07, + "loss": 0.7668, + "step": 2558 + }, + { + "epoch": 0.9377061194576768, + "grad_norm": 2.1419708728790283, + "learning_rate": 1.9228810383262964e-07, + "loss": 0.8236, + "step": 2559 + }, + { + "epoch": 0.9380725540491023, + "grad_norm": 2.4711034297943115, + "learning_rate": 1.900397159758427e-07, + "loss": 0.8673, + "step": 2560 + }, + { + "epoch": 0.9384389886405277, + "grad_norm": 2.5989861488342285, + "learning_rate": 1.8780442441652335e-07, + "loss": 0.7914, + "step": 2561 + }, + { + "epoch": 0.9388054232319532, + "grad_norm": 2.1352574825286865, + "learning_rate": 1.855822321387868e-07, + "loss": 0.8208, + "step": 2562 + }, + { + "epoch": 0.9391718578233785, + "grad_norm": 2.5791261196136475, + "learning_rate": 1.8337314210925994e-07, + "loss": 0.8302, + "step": 2563 + }, + { + "epoch": 0.9395382924148039, + "grad_norm": 2.4570558071136475, + "learning_rate": 1.8117715727707596e-07, + "loss": 0.8385, + "step": 2564 + }, + { + "epoch": 0.9399047270062294, + "grad_norm": 2.640968084335327, + "learning_rate": 1.7899428057387536e-07, + "loss": 0.8168, + "step": 2565 + }, + { + "epoch": 0.9402711615976548, + "grad_norm": 2.4151952266693115, + "learning_rate": 1.7682451491379927e-07, + "loss": 0.813, + "step": 2566 + }, + { + "epoch": 0.9406375961890803, + "grad_norm": 2.6391801834106445, + "learning_rate": 1.7466786319348505e-07, + "loss": 0.7144, + "step": 2567 + }, + { + "epoch": 0.9410040307805057, + "grad_norm": 2.176057815551758, + "learning_rate": 1.72524328292063e-07, + "loss": 0.9107, + "step": 2568 + }, + { + "epoch": 0.9413704653719311, + "grad_norm": 2.6201252937316895, + "learning_rate": 1.7039391307115184e-07, + "loss": 0.76, + "step": 2569 + }, + { + "epoch": 0.9417368999633565, + "grad_norm": 2.5767018795013428, + "learning_rate": 1.6827662037485537e-07, + "loss": 0.8244, + "step": 2570 + }, + { + "epoch": 0.942103334554782, + "grad_norm": 2.420722246170044, + "learning_rate": 1.6617245302976147e-07, + "loss": 0.7693, + "step": 2571 + }, + { + "epoch": 0.9424697691462074, + "grad_norm": 2.330512046813965, + "learning_rate": 1.6408141384493203e-07, + "loss": 0.8218, + "step": 2572 + }, + { + "epoch": 0.9428362037376329, + "grad_norm": 2.759117364883423, + "learning_rate": 1.6200350561190404e-07, + "loss": 0.7241, + "step": 2573 + }, + { + "epoch": 0.9432026383290583, + "grad_norm": 2.423530101776123, + "learning_rate": 1.5993873110468627e-07, + "loss": 0.8249, + "step": 2574 + }, + { + "epoch": 0.9435690729204836, + "grad_norm": 2.5526938438415527, + "learning_rate": 1.578870930797538e-07, + "loss": 0.809, + "step": 2575 + }, + { + "epoch": 0.9439355075119091, + "grad_norm": 2.314013719558716, + "learning_rate": 1.5584859427604017e-07, + "loss": 0.8529, + "step": 2576 + }, + { + "epoch": 0.9443019421033345, + "grad_norm": 2.5163190364837646, + "learning_rate": 1.5382323741494508e-07, + "loss": 0.7894, + "step": 2577 + }, + { + "epoch": 0.94466837669476, + "grad_norm": 2.736426591873169, + "learning_rate": 1.5181102520031688e-07, + "loss": 0.8482, + "step": 2578 + }, + { + "epoch": 0.9450348112861854, + "grad_norm": 2.334437370300293, + "learning_rate": 1.4981196031846113e-07, + "loss": 0.8432, + "step": 2579 + }, + { + "epoch": 0.9454012458776109, + "grad_norm": 2.745434284210205, + "learning_rate": 1.478260454381286e-07, + "loss": 0.8045, + "step": 2580 + }, + { + "epoch": 0.9457676804690363, + "grad_norm": 2.711455821990967, + "learning_rate": 1.458532832105153e-07, + "loss": 0.759, + "step": 2581 + }, + { + "epoch": 0.9461341150604617, + "grad_norm": 2.373372793197632, + "learning_rate": 1.438936762692611e-07, + "loss": 0.8101, + "step": 2582 + }, + { + "epoch": 0.9465005496518871, + "grad_norm": 2.2710046768188477, + "learning_rate": 1.4194722723043785e-07, + "loss": 0.839, + "step": 2583 + }, + { + "epoch": 0.9468669842433126, + "grad_norm": 2.3727807998657227, + "learning_rate": 1.4001393869255808e-07, + "loss": 0.7941, + "step": 2584 + }, + { + "epoch": 0.947233418834738, + "grad_norm": 2.5571014881134033, + "learning_rate": 1.3809381323656058e-07, + "loss": 0.761, + "step": 2585 + }, + { + "epoch": 0.9475998534261634, + "grad_norm": 2.3163387775421143, + "learning_rate": 1.3618685342581484e-07, + "loss": 0.8199, + "step": 2586 + }, + { + "epoch": 0.9479662880175889, + "grad_norm": 2.1683309078216553, + "learning_rate": 1.3429306180611e-07, + "loss": 0.8718, + "step": 2587 + }, + { + "epoch": 0.9483327226090142, + "grad_norm": 2.4830777645111084, + "learning_rate": 1.324124409056593e-07, + "loss": 0.7473, + "step": 2588 + }, + { + "epoch": 0.9486991572004397, + "grad_norm": 2.4777321815490723, + "learning_rate": 1.3054499323508995e-07, + "loss": 0.8145, + "step": 2589 + }, + { + "epoch": 0.9490655917918651, + "grad_norm": 2.565575361251831, + "learning_rate": 1.2869072128744663e-07, + "loss": 0.7228, + "step": 2590 + }, + { + "epoch": 0.9494320263832906, + "grad_norm": 2.5673301219940186, + "learning_rate": 1.2684962753818252e-07, + "loss": 0.8402, + "step": 2591 + }, + { + "epoch": 0.949798460974716, + "grad_norm": 2.938429594039917, + "learning_rate": 1.250217144451571e-07, + "loss": 0.727, + "step": 2592 + }, + { + "epoch": 0.9501648955661415, + "grad_norm": 2.7190816402435303, + "learning_rate": 1.23206984448635e-07, + "loss": 0.7654, + "step": 2593 + }, + { + "epoch": 0.9505313301575669, + "grad_norm": 2.4703621864318848, + "learning_rate": 1.214054399712794e-07, + "loss": 0.8063, + "step": 2594 + }, + { + "epoch": 0.9508977647489923, + "grad_norm": 2.362274408340454, + "learning_rate": 1.1961708341815316e-07, + "loss": 0.7834, + "step": 2595 + }, + { + "epoch": 0.9512641993404177, + "grad_norm": 2.552286386489868, + "learning_rate": 1.1784191717671312e-07, + "loss": 0.8552, + "step": 2596 + }, + { + "epoch": 0.9516306339318432, + "grad_norm": 2.515406847000122, + "learning_rate": 1.1607994361680585e-07, + "loss": 0.8017, + "step": 2597 + }, + { + "epoch": 0.9519970685232686, + "grad_norm": 2.715442180633545, + "learning_rate": 1.1433116509066644e-07, + "loss": 0.7763, + "step": 2598 + }, + { + "epoch": 0.952363503114694, + "grad_norm": 2.3628156185150146, + "learning_rate": 1.1259558393291292e-07, + "loss": 0.825, + "step": 2599 + }, + { + "epoch": 0.9527299377061195, + "grad_norm": 2.3146297931671143, + "learning_rate": 1.1087320246054634e-07, + "loss": 0.8421, + "step": 2600 + }, + { + "epoch": 0.9530963722975448, + "grad_norm": 2.279325485229492, + "learning_rate": 1.0916402297294848e-07, + "loss": 0.8499, + "step": 2601 + }, + { + "epoch": 0.9534628068889703, + "grad_norm": 2.527597427368164, + "learning_rate": 1.0746804775187081e-07, + "loss": 0.7355, + "step": 2602 + }, + { + "epoch": 0.9538292414803957, + "grad_norm": 2.3891003131866455, + "learning_rate": 1.0578527906144332e-07, + "loss": 0.7888, + "step": 2603 + }, + { + "epoch": 0.9541956760718212, + "grad_norm": 2.7313292026519775, + "learning_rate": 1.0411571914816121e-07, + "loss": 0.8209, + "step": 2604 + }, + { + "epoch": 0.9545621106632466, + "grad_norm": 2.987650156021118, + "learning_rate": 1.0245937024088493e-07, + "loss": 0.8033, + "step": 2605 + }, + { + "epoch": 0.9549285452546721, + "grad_norm": 2.2838006019592285, + "learning_rate": 1.0081623455084344e-07, + "loss": 0.8121, + "step": 2606 + }, + { + "epoch": 0.9552949798460975, + "grad_norm": 2.8768584728240967, + "learning_rate": 9.918631427162207e-08, + "loss": 0.725, + "step": 2607 + }, + { + "epoch": 0.955661414437523, + "grad_norm": 2.493642807006836, + "learning_rate": 9.756961157916355e-08, + "loss": 0.784, + "step": 2608 + }, + { + "epoch": 0.9560278490289483, + "grad_norm": 2.661923885345459, + "learning_rate": 9.596612863176702e-08, + "loss": 0.7796, + "step": 2609 + }, + { + "epoch": 0.9563942836203737, + "grad_norm": 2.2444090843200684, + "learning_rate": 9.437586757008121e-08, + "loss": 0.7572, + "step": 2610 + }, + { + "epoch": 0.9567607182117992, + "grad_norm": 2.4184560775756836, + "learning_rate": 9.27988305171057e-08, + "loss": 0.7662, + "step": 2611 + }, + { + "epoch": 0.9571271528032246, + "grad_norm": 2.5148234367370605, + "learning_rate": 9.123501957818415e-08, + "loss": 0.8242, + "step": 2612 + }, + { + "epoch": 0.9574935873946501, + "grad_norm": 3.0104429721832275, + "learning_rate": 8.968443684100659e-08, + "loss": 0.7282, + "step": 2613 + }, + { + "epoch": 0.9578600219860754, + "grad_norm": 2.3206191062927246, + "learning_rate": 8.814708437559827e-08, + "loss": 0.8091, + "step": 2614 + }, + { + "epoch": 0.9582264565775009, + "grad_norm": 2.781954288482666, + "learning_rate": 8.662296423432526e-08, + "loss": 0.8469, + "step": 2615 + }, + { + "epoch": 0.9585928911689263, + "grad_norm": 2.618665933609009, + "learning_rate": 8.511207845188884e-08, + "loss": 0.7353, + "step": 2616 + }, + { + "epoch": 0.9589593257603518, + "grad_norm": 2.366697311401367, + "learning_rate": 8.361442904532114e-08, + "loss": 0.8014, + "step": 2617 + }, + { + "epoch": 0.9593257603517772, + "grad_norm": 2.34576153755188, + "learning_rate": 8.213001801398501e-08, + "loss": 0.761, + "step": 2618 + }, + { + "epoch": 0.9596921949432027, + "grad_norm": 2.4065608978271484, + "learning_rate": 8.065884733956864e-08, + "loss": 0.8039, + "step": 2619 + }, + { + "epoch": 0.9600586295346281, + "grad_norm": 2.665904998779297, + "learning_rate": 7.920091898608429e-08, + "loss": 0.7685, + "step": 2620 + }, + { + "epoch": 0.9604250641260536, + "grad_norm": 2.423003673553467, + "learning_rate": 7.775623489986728e-08, + "loss": 0.8128, + "step": 2621 + }, + { + "epoch": 0.9607914987174789, + "grad_norm": 2.3617613315582275, + "learning_rate": 7.632479700957152e-08, + "loss": 0.7785, + "step": 2622 + }, + { + "epoch": 0.9611579333089043, + "grad_norm": 2.3319296836853027, + "learning_rate": 7.490660722616505e-08, + "loss": 0.8372, + "step": 2623 + }, + { + "epoch": 0.9615243679003298, + "grad_norm": 2.333374500274658, + "learning_rate": 7.35016674429323e-08, + "loss": 0.8241, + "step": 2624 + }, + { + "epoch": 0.9618908024917552, + "grad_norm": 2.2863078117370605, + "learning_rate": 7.21099795354674e-08, + "loss": 0.8183, + "step": 2625 + }, + { + "epoch": 0.9622572370831807, + "grad_norm": 2.8025894165039062, + "learning_rate": 7.073154536167415e-08, + "loss": 0.7713, + "step": 2626 + }, + { + "epoch": 0.962623671674606, + "grad_norm": 2.2680001258850098, + "learning_rate": 6.936636676176278e-08, + "loss": 0.8475, + "step": 2627 + }, + { + "epoch": 0.9629901062660315, + "grad_norm": 2.855222225189209, + "learning_rate": 6.801444555824543e-08, + "loss": 0.7444, + "step": 2628 + }, + { + "epoch": 0.9633565408574569, + "grad_norm": 2.8795368671417236, + "learning_rate": 6.667578355593951e-08, + "loss": 0.6874, + "step": 2629 + }, + { + "epoch": 0.9637229754488824, + "grad_norm": 2.410686731338501, + "learning_rate": 6.535038254195547e-08, + "loss": 0.7999, + "step": 2630 + }, + { + "epoch": 0.9640894100403078, + "grad_norm": 2.366168737411499, + "learning_rate": 6.403824428570571e-08, + "loss": 0.8394, + "step": 2631 + }, + { + "epoch": 0.9644558446317333, + "grad_norm": 2.3753418922424316, + "learning_rate": 6.273937053889456e-08, + "loss": 0.7992, + "step": 2632 + }, + { + "epoch": 0.9648222792231587, + "grad_norm": 2.5532946586608887, + "learning_rate": 6.145376303551942e-08, + "loss": 0.8015, + "step": 2633 + }, + { + "epoch": 0.965188713814584, + "grad_norm": 2.54056715965271, + "learning_rate": 6.018142349186518e-08, + "loss": 0.8013, + "step": 2634 + }, + { + "epoch": 0.9655551484060095, + "grad_norm": 2.5784850120544434, + "learning_rate": 5.8922353606504224e-08, + "loss": 0.8266, + "step": 2635 + }, + { + "epoch": 0.9659215829974349, + "grad_norm": 2.5065808296203613, + "learning_rate": 5.767655506029646e-08, + "loss": 0.8145, + "step": 2636 + }, + { + "epoch": 0.9662880175888604, + "grad_norm": 2.7400426864624023, + "learning_rate": 5.644402951638372e-08, + "loss": 0.7818, + "step": 2637 + }, + { + "epoch": 0.9666544521802858, + "grad_norm": 2.1992766857147217, + "learning_rate": 5.522477862018538e-08, + "loss": 0.8637, + "step": 2638 + }, + { + "epoch": 0.9670208867717113, + "grad_norm": 2.5704853534698486, + "learning_rate": 5.401880399940385e-08, + "loss": 0.7717, + "step": 2639 + }, + { + "epoch": 0.9673873213631367, + "grad_norm": 2.2856333255767822, + "learning_rate": 5.28261072640146e-08, + "loss": 0.8073, + "step": 2640 + }, + { + "epoch": 0.9677537559545621, + "grad_norm": 2.359670877456665, + "learning_rate": 5.164669000626621e-08, + "loss": 0.8069, + "step": 2641 + }, + { + "epoch": 0.9681201905459875, + "grad_norm": 2.493969678878784, + "learning_rate": 5.048055380068251e-08, + "loss": 0.8215, + "step": 2642 + }, + { + "epoch": 0.968486625137413, + "grad_norm": 2.170348882675171, + "learning_rate": 4.932770020405486e-08, + "loss": 0.8041, + "step": 2643 + }, + { + "epoch": 0.9688530597288384, + "grad_norm": 2.3696866035461426, + "learning_rate": 4.818813075544215e-08, + "loss": 0.8614, + "step": 2644 + }, + { + "epoch": 0.9692194943202639, + "grad_norm": 2.762871503829956, + "learning_rate": 4.706184697617189e-08, + "loss": 0.8588, + "step": 2645 + }, + { + "epoch": 0.9695859289116893, + "grad_norm": 2.926147222518921, + "learning_rate": 4.594885036983021e-08, + "loss": 0.7791, + "step": 2646 + }, + { + "epoch": 0.9699523635031146, + "grad_norm": 2.2866110801696777, + "learning_rate": 4.484914242226967e-08, + "loss": 0.7318, + "step": 2647 + }, + { + "epoch": 0.9703187980945401, + "grad_norm": 2.318859338760376, + "learning_rate": 4.376272460159925e-08, + "loss": 0.7696, + "step": 2648 + }, + { + "epoch": 0.9706852326859655, + "grad_norm": 2.466014862060547, + "learning_rate": 4.2689598358187646e-08, + "loss": 0.8135, + "step": 2649 + }, + { + "epoch": 0.971051667277391, + "grad_norm": 3.416146755218506, + "learning_rate": 4.1629765124656665e-08, + "loss": 0.7498, + "step": 2650 + }, + { + "epoch": 0.9714181018688164, + "grad_norm": 3.0079779624938965, + "learning_rate": 4.0583226315886736e-08, + "loss": 0.8217, + "step": 2651 + }, + { + "epoch": 0.9717845364602419, + "grad_norm": 2.374130964279175, + "learning_rate": 3.954998332900473e-08, + "loss": 0.7957, + "step": 2652 + }, + { + "epoch": 0.9721509710516673, + "grad_norm": 2.347937822341919, + "learning_rate": 3.853003754339057e-08, + "loss": 0.853, + "step": 2653 + }, + { + "epoch": 0.9725174056430927, + "grad_norm": 2.450432062149048, + "learning_rate": 3.752339032067287e-08, + "loss": 0.8669, + "step": 2654 + }, + { + "epoch": 0.9728838402345181, + "grad_norm": 3.1508584022521973, + "learning_rate": 3.653004300472551e-08, + "loss": 0.8051, + "step": 2655 + }, + { + "epoch": 0.9732502748259436, + "grad_norm": 2.312556505203247, + "learning_rate": 3.5549996921666605e-08, + "loss": 0.8128, + "step": 2656 + }, + { + "epoch": 0.973616709417369, + "grad_norm": 2.2593586444854736, + "learning_rate": 3.458325337985735e-08, + "loss": 0.8601, + "step": 2657 + }, + { + "epoch": 0.9739831440087944, + "grad_norm": 2.395019054412842, + "learning_rate": 3.362981366990092e-08, + "loss": 0.7701, + "step": 2658 + }, + { + "epoch": 0.9743495786002199, + "grad_norm": 2.3906631469726562, + "learning_rate": 3.2689679064640264e-08, + "loss": 0.8176, + "step": 2659 + }, + { + "epoch": 0.9747160131916452, + "grad_norm": 2.3747525215148926, + "learning_rate": 3.176285081915365e-08, + "loss": 0.7992, + "step": 2660 + }, + { + "epoch": 0.9750824477830707, + "grad_norm": 2.841897964477539, + "learning_rate": 3.084933017075797e-08, + "loss": 0.7881, + "step": 2661 + }, + { + "epoch": 0.9754488823744961, + "grad_norm": 2.2802422046661377, + "learning_rate": 2.994911833900216e-08, + "loss": 0.888, + "step": 2662 + }, + { + "epoch": 0.9758153169659216, + "grad_norm": 3.2933173179626465, + "learning_rate": 2.906221652567043e-08, + "loss": 0.7663, + "step": 2663 + }, + { + "epoch": 0.976181751557347, + "grad_norm": 2.538501024246216, + "learning_rate": 2.818862591477567e-08, + "loss": 0.779, + "step": 2664 + }, + { + "epoch": 0.9765481861487725, + "grad_norm": 2.564566135406494, + "learning_rate": 2.732834767256276e-08, + "loss": 0.8072, + "step": 2665 + }, + { + "epoch": 0.9769146207401979, + "grad_norm": 2.49530291557312, + "learning_rate": 2.648138294750191e-08, + "loss": 0.8292, + "step": 2666 + }, + { + "epoch": 0.9772810553316233, + "grad_norm": 2.2448463439941406, + "learning_rate": 2.5647732870290877e-08, + "loss": 0.7876, + "step": 2667 + }, + { + "epoch": 0.9776474899230487, + "grad_norm": 2.516613245010376, + "learning_rate": 2.4827398553852744e-08, + "loss": 0.7823, + "step": 2668 + }, + { + "epoch": 0.9780139245144742, + "grad_norm": 2.4173407554626465, + "learning_rate": 2.402038109333482e-08, + "loss": 0.8346, + "step": 2669 + }, + { + "epoch": 0.9783803591058996, + "grad_norm": 2.280996322631836, + "learning_rate": 2.3226681566105303e-08, + "loss": 0.8332, + "step": 2670 + }, + { + "epoch": 0.978746793697325, + "grad_norm": 2.479410409927368, + "learning_rate": 2.244630103175216e-08, + "loss": 0.8781, + "step": 2671 + }, + { + "epoch": 0.9791132282887505, + "grad_norm": 2.1932010650634766, + "learning_rate": 2.1679240532083147e-08, + "loss": 0.8298, + "step": 2672 + }, + { + "epoch": 0.9794796628801758, + "grad_norm": 2.397739887237549, + "learning_rate": 2.0925501091125788e-08, + "loss": 0.8105, + "step": 2673 + }, + { + "epoch": 0.9798460974716013, + "grad_norm": 2.362424612045288, + "learning_rate": 2.0185083715121845e-08, + "loss": 0.8627, + "step": 2674 + }, + { + "epoch": 0.9802125320630267, + "grad_norm": 2.419423818588257, + "learning_rate": 1.9457989392527298e-08, + "loss": 0.8018, + "step": 2675 + }, + { + "epoch": 0.9805789666544522, + "grad_norm": 2.3025901317596436, + "learning_rate": 1.8744219094013473e-08, + "loss": 0.805, + "step": 2676 + }, + { + "epoch": 0.9809454012458776, + "grad_norm": 2.2877860069274902, + "learning_rate": 1.8043773772464802e-08, + "loss": 0.7481, + "step": 2677 + }, + { + "epoch": 0.9813118358373031, + "grad_norm": 2.5664076805114746, + "learning_rate": 1.73566543629744e-08, + "loss": 0.8295, + "step": 2678 + }, + { + "epoch": 0.9816782704287285, + "grad_norm": 2.4994051456451416, + "learning_rate": 1.6682861782848503e-08, + "loss": 0.8511, + "step": 2679 + }, + { + "epoch": 0.982044705020154, + "grad_norm": 2.296548366546631, + "learning_rate": 1.6022396931598682e-08, + "loss": 0.8148, + "step": 2680 + }, + { + "epoch": 0.9824111396115793, + "grad_norm": 2.2606639862060547, + "learning_rate": 1.5375260690945194e-08, + "loss": 0.8016, + "step": 2681 + }, + { + "epoch": 0.9827775742030047, + "grad_norm": 2.6477155685424805, + "learning_rate": 1.4741453924816962e-08, + "loss": 0.7658, + "step": 2682 + }, + { + "epoch": 0.9831440087944302, + "grad_norm": 2.2549641132354736, + "learning_rate": 1.4120977479344933e-08, + "loss": 0.8707, + "step": 2683 + }, + { + "epoch": 0.9835104433858556, + "grad_norm": 2.463927745819092, + "learning_rate": 1.3513832182864284e-08, + "loss": 0.7888, + "step": 2684 + }, + { + "epoch": 0.9838768779772811, + "grad_norm": 2.3416614532470703, + "learning_rate": 1.2920018845915539e-08, + "loss": 0.7944, + "step": 2685 + }, + { + "epoch": 0.9842433125687065, + "grad_norm": 2.617210626602173, + "learning_rate": 1.2339538261239015e-08, + "loss": 0.8209, + "step": 2686 + }, + { + "epoch": 0.9846097471601319, + "grad_norm": 2.4247403144836426, + "learning_rate": 1.1772391203775934e-08, + "loss": 0.7562, + "step": 2687 + }, + { + "epoch": 0.9849761817515573, + "grad_norm": 2.344165325164795, + "learning_rate": 1.1218578430667315e-08, + "loss": 0.8384, + "step": 2688 + }, + { + "epoch": 0.9853426163429828, + "grad_norm": 2.7740414142608643, + "learning_rate": 1.0678100681255077e-08, + "loss": 0.8237, + "step": 2689 + }, + { + "epoch": 0.9857090509344082, + "grad_norm": 3.037654399871826, + "learning_rate": 1.0150958677075384e-08, + "loss": 0.6686, + "step": 2690 + }, + { + "epoch": 0.9860754855258337, + "grad_norm": 2.509370803833008, + "learning_rate": 9.637153121863085e-09, + "loss": 0.7722, + "step": 2691 + }, + { + "epoch": 0.9864419201172591, + "grad_norm": 2.465951442718506, + "learning_rate": 9.136684701548381e-09, + "loss": 0.7896, + "step": 2692 + }, + { + "epoch": 0.9868083547086846, + "grad_norm": 2.5230894088745117, + "learning_rate": 8.649554084256829e-09, + "loss": 0.8261, + "step": 2693 + }, + { + "epoch": 0.9871747893001099, + "grad_norm": 3.178312301635742, + "learning_rate": 8.175761920309332e-09, + "loss": 0.724, + "step": 2694 + }, + { + "epoch": 0.9875412238915353, + "grad_norm": 2.954385757446289, + "learning_rate": 7.715308842215496e-09, + "loss": 0.7312, + "step": 2695 + }, + { + "epoch": 0.9879076584829608, + "grad_norm": 2.2207462787628174, + "learning_rate": 7.268195464683603e-09, + "loss": 0.832, + "step": 2696 + }, + { + "epoch": 0.9882740930743862, + "grad_norm": 2.27506947517395, + "learning_rate": 6.834422384607298e-09, + "loss": 0.7862, + "step": 2697 + }, + { + "epoch": 0.9886405276658117, + "grad_norm": 2.9832675457000732, + "learning_rate": 6.41399018107447e-09, + "loss": 0.8149, + "step": 2698 + }, + { + "epoch": 0.989006962257237, + "grad_norm": 2.37428617477417, + "learning_rate": 6.00689941536281e-09, + "loss": 0.8005, + "step": 2699 + }, + { + "epoch": 0.9893733968486625, + "grad_norm": 2.8593597412109375, + "learning_rate": 5.613150630938702e-09, + "loss": 0.7504, + "step": 2700 + }, + { + "epoch": 0.9897398314400879, + "grad_norm": 2.6248154640197754, + "learning_rate": 5.2327443534549994e-09, + "loss": 0.8944, + "step": 2701 + }, + { + "epoch": 0.9901062660315134, + "grad_norm": 2.416533946990967, + "learning_rate": 4.86568109075547e-09, + "loss": 0.8241, + "step": 2702 + }, + { + "epoch": 0.9904727006229388, + "grad_norm": 2.352708578109741, + "learning_rate": 4.511961332869241e-09, + "loss": 0.8209, + "step": 2703 + }, + { + "epoch": 0.9908391352143643, + "grad_norm": 2.675865411758423, + "learning_rate": 4.1715855520130246e-09, + "loss": 0.7851, + "step": 2704 + }, + { + "epoch": 0.9912055698057897, + "grad_norm": 2.2010722160339355, + "learning_rate": 3.84455420258667e-09, + "loss": 0.8281, + "step": 2705 + }, + { + "epoch": 0.991572004397215, + "grad_norm": 2.4950497150421143, + "learning_rate": 3.5308677211798313e-09, + "loss": 0.8054, + "step": 2706 + }, + { + "epoch": 0.9919384389886405, + "grad_norm": 2.262019157409668, + "learning_rate": 3.2305265265608622e-09, + "loss": 0.7843, + "step": 2707 + }, + { + "epoch": 0.9923048735800659, + "grad_norm": 2.4214706420898438, + "learning_rate": 2.943531019686807e-09, + "loss": 0.8217, + "step": 2708 + }, + { + "epoch": 0.9926713081714914, + "grad_norm": 2.6333794593811035, + "learning_rate": 2.669881583696743e-09, + "loss": 0.7976, + "step": 2709 + }, + { + "epoch": 0.9930377427629168, + "grad_norm": 2.6949565410614014, + "learning_rate": 2.409578583912886e-09, + "loss": 0.813, + "step": 2710 + }, + { + "epoch": 0.9934041773543423, + "grad_norm": 2.509366750717163, + "learning_rate": 2.162622367839484e-09, + "loss": 0.7697, + "step": 2711 + }, + { + "epoch": 0.9937706119457677, + "grad_norm": 2.343609094619751, + "learning_rate": 1.9290132651628157e-09, + "loss": 0.781, + "step": 2712 + }, + { + "epoch": 0.9941370465371931, + "grad_norm": 2.2084622383117676, + "learning_rate": 1.7087515877511895e-09, + "loss": 0.8529, + "step": 2713 + }, + { + "epoch": 0.9945034811286185, + "grad_norm": 2.560805320739746, + "learning_rate": 1.501837629653835e-09, + "loss": 0.7243, + "step": 2714 + }, + { + "epoch": 0.994869915720044, + "grad_norm": 2.592674970626831, + "learning_rate": 1.3082716671020123e-09, + "loss": 0.8383, + "step": 2715 + }, + { + "epoch": 0.9952363503114694, + "grad_norm": 2.773082733154297, + "learning_rate": 1.1280539585045713e-09, + "loss": 0.8462, + "step": 2716 + }, + { + "epoch": 0.9956027849028949, + "grad_norm": 2.683271646499634, + "learning_rate": 9.611847444523925e-10, + "loss": 0.7387, + "step": 2717 + }, + { + "epoch": 0.9959692194943203, + "grad_norm": 3.2599453926086426, + "learning_rate": 8.076642477172769e-10, + "loss": 0.7307, + "step": 2718 + }, + { + "epoch": 0.9963356540857456, + "grad_norm": 2.323328971862793, + "learning_rate": 6.674926732475051e-10, + "loss": 0.8463, + "step": 2719 + }, + { + "epoch": 0.9967020886771711, + "grad_norm": 2.6469409465789795, + "learning_rate": 5.406702081733883e-10, + "loss": 0.7977, + "step": 2720 + }, + { + "epoch": 0.9970685232685965, + "grad_norm": 2.4954774379730225, + "learning_rate": 4.2719702180282763e-10, + "loss": 0.8084, + "step": 2721 + }, + { + "epoch": 0.997434957860022, + "grad_norm": 2.8149211406707764, + "learning_rate": 3.270732656213138e-10, + "loss": 0.8145, + "step": 2722 + }, + { + "epoch": 0.9978013924514474, + "grad_norm": 2.791076421737671, + "learning_rate": 2.4029907329525815e-10, + "loss": 0.7137, + "step": 2723 + }, + { + "epoch": 0.9981678270428729, + "grad_norm": 2.6285765171051025, + "learning_rate": 1.668745606675515e-10, + "loss": 0.7838, + "step": 2724 + }, + { + "epoch": 0.9985342616342983, + "grad_norm": 2.47336483001709, + "learning_rate": 1.0679982576089487e-10, + "loss": 0.813, + "step": 2725 + }, + { + "epoch": 0.9989006962257237, + "grad_norm": 2.5160090923309326, + "learning_rate": 6.007494877335873e-11, + "loss": 0.7567, + "step": 2726 + }, + { + "epoch": 0.9992671308171491, + "grad_norm": 2.384516954421997, + "learning_rate": 2.6699992085044146e-11, + "loss": 0.8212, + "step": 2727 + }, + { + "epoch": 0.9996335654085746, + "grad_norm": 3.125704050064087, + "learning_rate": 6.675000249201091e-12, + "loss": 0.7671, + "step": 2728 + }, + { + "epoch": 1.0, + "grad_norm": 2.552747964859009, + "learning_rate": 0.0, + "loss": 0.7764, + "step": 2729 + } + ], + "logging_steps": 1, + "max_steps": 2729, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.426020372786971e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}